From 79a0d9c3cced449cafe843d6b0a51df75edb2881 Mon Sep 17 00:00:00 2001
From: Misha Gutman
Date: Thu, 24 Oct 2024 12:23:42 -0700
Subject: [PATCH] Removed unused GlobalAveragePooling internal code.

PiperOrigin-RevId: 689475112
---
 BUILD.bazel | 1 -
 CMakeLists.txt | 7 -
 build_srcs.bzl | 1 -
 cmake/gen/f16c_microkernels.cmake | 8 -
 cmake/gen/neon_microkernels.cmake | 34 -
 cmake/gen/neonfp16arith_microkernels.cmake | 8 -
 cmake/gen/neonv8_microkernels.cmake | 16 -
 cmake/gen/rvv_microkernels.cmake | 6 -
 cmake/gen/scalar_microkernels.cmake | 38 -
 cmake/gen/sse2_microkernels.cmake | 12 -
 cmake/gen/sse41_microkernels.cmake | 12 -
 cmake/gen/sse_microkernels.cmake | 2 -
 cmake/gen/wasm_microkernels.cmake | 2 -
 cmake/gen/wasmsimd_microkernels.cmake | 20 -
 gen/f16c_microkernels.bzl | 8 -
 gen/neon_microkernels.bzl | 34 -
 gen/neonfp16arith_microkernels.bzl | 8 -
 gen/neonv8_microkernels.bzl | 16 -
 gen/rvv_microkernels.bzl | 6 -
 gen/scalar_microkernels.bzl | 38 -
 gen/sse2_microkernels.bzl | 12 -
 gen/sse41_microkernels.bzl | 12 -
 gen/sse_microkernels.bzl | 2 -
 gen/wasm_microkernels.bzl | 2 -
 gen/wasmsimd_microkernels.bzl | 20 -
 scripts/generate-f16-gavgpool.sh | 29 -
 scripts/generate-f32-gavgpool.sh | 16 -
 scripts/generate-qs8-gavgpool.sh | 171 -
 scripts/generate-tests.sh | 8 -
 src/configs/gavgpool-config.c | 308 -
 .../gen/f16-gavgpool-7p7x-minmax-f16c-c16.c | 300 -
 .../gen/f16-gavgpool-7p7x-minmax-f16c-c24.c | 349 -
 .../gen/f16-gavgpool-7p7x-minmax-f16c-c32.c | 398 -
 .../gen/f16-gavgpool-7p7x-minmax-f16c-c8.c | 197 -
 ...6-gavgpool-7p7x-minmax-neonfp16arith-c16.c | 290 -
 ...6-gavgpool-7p7x-minmax-neonfp16arith-c24.c | 339 -
 ...6-gavgpool-7p7x-minmax-neonfp16arith-c32.c | 388 -
 ...16-gavgpool-7p7x-minmax-neonfp16arith-c8.c | 189 -
 .../gen/f16-gavgpool-7x-minmax-f16c-c16.c | 166 -
 .../gen/f16-gavgpool-7x-minmax-f16c-c24.c | 183 -
 .../gen/f16-gavgpool-7x-minmax-f16c-c32.c | 200 -
 .../gen/f16-gavgpool-7x-minmax-f16c-c8.c | 135 -
 ...f16-gavgpool-7x-minmax-neonfp16arith-c16.c | 143 -
 ...f16-gavgpool-7x-minmax-neonfp16arith-c24.c | 160 -
 ...f16-gavgpool-7x-minmax-neonfp16arith-c32.c | 177 -
 .../f16-gavgpool-7x-minmax-neonfp16arith-c8.c | 120 -
 src/f16-gavgpool/multipass-f16c.c.in | 218 -
 src/f16-gavgpool/multipass-neonfp16arith.c.in | 205 -
 src/f16-gavgpool/unipass-f16c.c.in | 152 -
 src/f16-gavgpool/unipass-neonfp16arith.c.in | 123 -
 .../f32-gavgpool-7p7x-minmax-neon-c4.c | 184 -
 .../f32-gavgpool-7p7x-minmax-scalar-c1.c | 148 -
 .../f32-gavgpool-7p7x-minmax-sse-c4.c | 211 -
 .../f32-gavgpool-7p7x-minmax-wasm-c1.c | 148 -
 ...f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c | 209 -
 ...f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c | 209 -
 .../f32-gavgpool-7x-minmax-neon-c4.c | 112 -
 .../f32-gavgpool-7x-minmax-scalar-c1.c | 78 -
 .../f32-gavgpool-7x-minmax-sse-c4.c | 124 -
 .../f32-gavgpool-7x-minmax-wasm-c1.c | 78 -
 .../f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c | 121 -
 .../f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c | 121 -
 .../gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c | 150 -
 .../gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c | 150 -
 .../gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c | 150 -
 .../gen/f32-gavgpool-7x-minmax-rvv-c1v.c | 79 -
 .../gen/f32-gavgpool-7x-minmax-rvv-c2v.c | 79 -
 .../gen/f32-gavgpool-7x-minmax-rvv-c4v.c | 79 -
 src/f32-gavgpool/rvv_7p7x.c.in | 147 -
 src/f32-gavgpool/rvv_7x.c.in | 76 -
 src/microparams-init.c | 138 -
 src/operator-run.c | 87 -
 .../qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c | 315 -
 .../qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c | 437 -
 .../qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c | 500 -
 .../qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c | 246 -
 ...qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c | 310 -
 ...qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c | 431 -
 ...qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c | 493 -
 .../qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c | 242 -
 ...vgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c | 155 -
 ...vgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c | 261 -
 ...vgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c | 367 -
 ...vgpool-7p7x-minmax-fp32-scalar-imagic-c1.c | 156 -
 ...vgpool-7p7x-minmax-fp32-scalar-imagic-c2.c | 265 -
 ...vgpool-7p7x-minmax-fp32-scalar-imagic-c4.c | 373 -
 ...vgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c | 155 -
 ...vgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c | 261 -
 ...vgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c | 367 -
 .../qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c | 432 -
 .../qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c | 631 -
 .../qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c | 335 -
 .../qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c | 348 -
 .../qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c | 493 -
 .../qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c | 276 -
 ...8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c | 350 -
 ...8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c | 492 -
 ...8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c | 556 -
 ...s8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c | 278 -
 .../qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c | 311 -
 .../qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c | 432 -
 .../qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c | 494 -
 .../qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c | 243 -
 .../qs8-gavgpool-7x-minmax-fp32-neon-c16.c | 199 -
 .../qs8-gavgpool-7x-minmax-fp32-neon-c24.c | 229 -
 .../qs8-gavgpool-7x-minmax-fp32-neon-c32.c | 254 -
 .../gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c | 168 -
 .../qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c | 194 -
 .../qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c | 223 -
 .../qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c | 247 -
 .../qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c | 164 -
 ...gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c | 88 -
 ...gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c | 147 -
 ...gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c | 189 -
 ...gavgpool-7x-minmax-fp32-scalar-imagic-c1.c | 89 -
 ...gavgpool-7x-minmax-fp32-scalar-imagic-c2.c | 151 -
 ...gavgpool-7x-minmax-fp32-scalar-imagic-c4.c | 195 -
 ...gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c | 88 -
 ...gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c | 147 -
 ...gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c | 189 -
 .../qs8-gavgpool-7x-minmax-fp32-sse2-c16.c | 252 -
 .../qs8-gavgpool-7x-minmax-fp32-sse2-c24.c | 289 -
 .../gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c | 209 -
 .../qs8-gavgpool-7x-minmax-fp32-sse41-c16.c | 212 -
 .../qs8-gavgpool-7x-minmax-fp32-sse41-c24.c | 241 -
 .../qs8-gavgpool-7x-minmax-fp32-sse41-c8.c | 178 -
 ...qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c | 211 -
 ...qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c | 240 -
 ...qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c | 266 -
 .../qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c | 177 -
 .../qs8-gavgpool-7x-minmax-rndnu-neon-c16.c | 195 -
 .../qs8-gavgpool-7x-minmax-rndnu-neon-c24.c | 224 -
 .../qs8-gavgpool-7x-minmax-rndnu-neon-c32.c | 248 -
 .../qs8-gavgpool-7x-minmax-rndnu-neon-c8.c | 165 -
 src/qs8-gavgpool/multipass-neon.c.in | 423 -
 src/qs8-gavgpool/multipass-scalar.c.in | 319 -
 src/qs8-gavgpool/multipass-sse2.c.in | 392 -
 src/qs8-gavgpool/multipass-sse4.c.in | 389 -
 src/qs8-gavgpool/multipass-wasmsimd.c.in | 365 -
 src/qs8-gavgpool/unipass-neon.c.in | 278 -
 src/qs8-gavgpool/unipass-scalar.c.in | 230 -
 src/qs8-gavgpool/unipass-sse2.c.in | 218 -
 src/qs8-gavgpool/unipass-sse4.c.in | 211 -
 src/qs8-gavgpool/unipass-wasmsimd.c.in | 203 -
 .../qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c | 315 -
 .../qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c | 437 -
 .../qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c | 500 -
 .../qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c | 246 -
 ...qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c | 310 -
 ...qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c | 431 -
 ...qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c | 493 -
 .../qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c | 242 -
 ...vgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c | 155 -
 ...vgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c | 261 -
 ...vgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c | 367 -
 ...vgpool-7p7x-minmax-fp32-scalar-imagic-c1.c | 156 -
 ...vgpool-7p7x-minmax-fp32-scalar-imagic-c2.c | 265 -
 ...vgpool-7p7x-minmax-fp32-scalar-imagic-c4.c | 373 -
 ...vgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c | 155 -
 ...vgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c | 261 -
 ...vgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c | 367 -
 .../qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c | 425 -
 .../qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c | 619 -
 .../qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c | 332 -
 .../qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c | 351 -
 .../qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c | 496 -
 .../qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c | 279 -
 ...8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c | 350 -
 ...8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c | 492 -
 ...8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c | 556 -
 ...u8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c | 278 -
 .../qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c | 311 -
 .../qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c | 432 -
 .../qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c | 494 -
 .../qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c | 243 -
 .../qu8-gavgpool-7x-minmax-fp32-neon-c16.c | 199 -
 .../qu8-gavgpool-7x-minmax-fp32-neon-c24.c | 229 -
 .../qu8-gavgpool-7x-minmax-fp32-neon-c32.c | 254 -
 .../gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c | 168 -
 .../qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c | 194 -
 .../qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c | 223 -
 .../qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c | 247 -
 .../qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c | 164 -
 ...gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c | 88 -
 ...gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c | 147 -
 ...gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c | 189 -
 ...gavgpool-7x-minmax-fp32-scalar-imagic-c1.c | 89 -
 ...gavgpool-7x-minmax-fp32-scalar-imagic-c2.c | 151 -
 ...gavgpool-7x-minmax-fp32-scalar-imagic-c4.c | 195 -
 ...gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c | 88 -
 ...gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c | 147 -
 ...gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c | 189 -
 .../qu8-gavgpool-7x-minmax-fp32-sse2-c16.c | 249 -
 .../qu8-gavgpool-7x-minmax-fp32-sse2-c24.c | 285 -
 .../gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c | 208 -
 .../qu8-gavgpool-7x-minmax-fp32-sse41-c16.c | 213 -
 .../qu8-gavgpool-7x-minmax-fp32-sse41-c24.c | 242 -
 .../qu8-gavgpool-7x-minmax-fp32-sse41-c8.c | 179 -
 ...qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c | 211 -
 ...qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c | 240 -
 ...qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c | 266 -
 .../qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c | 177 -
 .../qu8-gavgpool-7x-minmax-rndnu-neon-c16.c | 195 -
 .../qu8-gavgpool-7x-minmax-rndnu-neon-c24.c | 224 -
 .../qu8-gavgpool-7x-minmax-rndnu-neon-c32.c | 248 -
 .../qu8-gavgpool-7x-minmax-rndnu-neon-c8.c | 165 -
 src/xnnpack/compute.h | 61 -
 src/xnnpack/config-types.h | 42 -
 src/xnnpack/config.h | 5 -
 src/xnnpack/gavgpool.h | 341 -
 src/xnnpack/microfnptr.h | 146 -
 src/xnnpack/microparams-init.h | 36 -
 src/xnnpack/microparams.h | 41 +-
 src/xnnpack/operator.h | 13 -
 test/BUILD.bazel | 54 -
 test/average-pooling-nhwc.cc | 954 +-
 test/f16-gavgpool-minmax.cc | 3403 -----
 test/f16-gavgpool-minmax.yaml | 40 -
 test/f32-gavgpool-minmax.cc | 3285 -----
 test/f32-gavgpool-minmax.yaml | 53 -
 test/gavgpool-microkernel-tester.h | 651 -
 test/qs8-gavgpool-minmax-fp32.cc | 10520 ----------------
 test/qs8-gavgpool-minmax-fp32.yaml | 117 -
 test/qs8-gavgpool-minmax-rndnu.cc | 1711 ---
 test/qs8-gavgpool-minmax-rndnu.yaml | 22 -
 test/qu8-gavgpool-minmax-fp32.cc | 10520 ----------------
 test/qu8-gavgpool-minmax-fp32.yaml | 117 -
 test/qu8-gavgpool-minmax-rndnu.cc | 1711 ---
 test/qu8-gavgpool-minmax-rndnu.yaml | 22 -
 tools/generate-gavgpool-test.py | 767 --
 tools/generate-rdsum-benchmark.py | 4 +-
 231 files changed, 480 insertions(+), 79479 deletions(-)
 delete mode 100755 scripts/generate-f16-gavgpool.sh
 delete mode 100755 scripts/generate-f32-gavgpool.sh
 delete mode 100755 scripts/generate-qs8-gavgpool.sh
 delete mode 100644 src/configs/gavgpool-config.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c
 delete mode 100644 src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c
 delete mode 100644 src/f16-gavgpool/multipass-f16c.c.in
 delete mode 100644 src/f16-gavgpool/multipass-neonfp16arith.c.in
 delete mode 100644 src/f16-gavgpool/unipass-f16c.c.in
 delete mode 100644 src/f16-gavgpool/unipass-neonfp16arith.c.in
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c
 delete mode 100644 src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c
 delete mode 100755 src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c
 delete mode 100755 src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c
 delete mode 100755 src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c
 delete mode 100755 src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c
 delete mode 100755 src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c
 delete mode 100755 src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c
 delete mode 100755 src/f32-gavgpool/rvv_7p7x.c.in
 delete mode 100755 src/f32-gavgpool/rvv_7x.c.in
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c
 delete mode 100644 src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c
 delete mode 100644 src/qs8-gavgpool/multipass-neon.c.in
 delete mode 100644 src/qs8-gavgpool/multipass-scalar.c.in
 delete mode 100644 src/qs8-gavgpool/multipass-sse2.c.in
 delete mode 100644 src/qs8-gavgpool/multipass-sse4.c.in
 delete mode 100644 src/qs8-gavgpool/multipass-wasmsimd.c.in
 delete mode 100644 src/qs8-gavgpool/unipass-neon.c.in
 delete mode 100644 src/qs8-gavgpool/unipass-scalar.c.in
 delete mode 100644 src/qs8-gavgpool/unipass-sse2.c.in
 delete mode 100644 src/qs8-gavgpool/unipass-sse4.c.in
 delete mode 100644 src/qs8-gavgpool/unipass-wasmsimd.c.in
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c
 delete mode 100644 src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c
 delete mode 100644 src/xnnpack/gavgpool.h
 delete mode 100644 test/f16-gavgpool-minmax.cc
 delete mode 100644 test/f16-gavgpool-minmax.yaml
 delete mode 100644 test/f32-gavgpool-minmax.cc
 delete mode 100644 test/f32-gavgpool-minmax.yaml
 delete mode 100644 test/gavgpool-microkernel-tester.h
 delete mode 100644 test/qs8-gavgpool-minmax-fp32.cc
 delete mode 100644 test/qs8-gavgpool-minmax-fp32.yaml
 delete mode 100644 test/qs8-gavgpool-minmax-rndnu.cc
 delete mode 100644 test/qs8-gavgpool-minmax-rndnu.yaml
 delete mode 100644 test/qu8-gavgpool-minmax-fp32.cc
 delete mode 100644 test/qu8-gavgpool-minmax-fp32.yaml
 delete mode 100644 test/qu8-gavgpool-minmax-rndnu.cc
 delete mode 100644 test/qu8-gavgpool-minmax-rndnu.yaml
 delete mode 100755 tools/generate-gavgpool-test.py

diff --git a/BUILD.bazel b/BUILD.bazel
index b8525a1e36c..12ed5539f19 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -221,7 +221,6 @@ MICROKERNEL_HDRS = [
     "src/xnnpack/conv.h",
     "src/xnnpack/dwconv.h",
     "src/xnnpack/fill.h",
-    "src/xnnpack/gavgpool.h",
     "src/xnnpack/gemm.h",
     "src/xnnpack/ibilinear.h",
     "src/xnnpack/igemm.h",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7b26c530497..5694c7ac6ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -507,7 +507,6 @@ SET(XNNPACK_SRCS
   src/configs/dwconv-config.c
   src/configs/dwconv2d-chw-config.c
   src/configs/experiments-config.c
-  src/configs/gavgpool-config.c
   src/configs/gemm-config.c
   src/configs/ibilinear-chw-config.c
   src/configs/ibilinear-config.c
@@ -1475,7 +1474,6 @@ IF(XNNPACK_BUILD_TESTS)
   f16-conv-hwc2chw
   f16-f32acc-rdsum
   f16-f32acc-rsum
-  f16-gavgpool-minmax
   f16-ibilinear-chw
   f16-ibilinear
   f16-raddstoreexpminusmax
@@ -1486,7 +1484,6 @@
   f16-vmulcaddc-minmax
   f32-conv-hwc
   f32-conv-hwc2chw
-  f32-gavgpool-minmax
   f32-ibilinear-chw
   f32-ibilinear
   f32-raddexpminusmax
@@ -1503,16 +1500,12 @@
   f32-vscaleextexp
   indirection
   packing
-  qs8-gavgpool-minmax-fp32
-  qs8-gavgpool-minmax-rndnu
   qs8-rdsum-minmax-fp32
   qu8-rdsum
   qs8-rsum
   qu8-rsum
   qs8-vhswish
   qs8-vlrelu
-  qu8-gavgpool-minmax-fp32
-  qu8-gavgpool-minmax-rndnu
   qu8-vhswish
   qu8-vlrelu
   s8-ibilinear
diff --git a/build_srcs.bzl b/build_srcs.bzl
index 53d0491e94e..407d13b0186 100644
--- a/build_srcs.bzl
+++ b/build_srcs.bzl
@@ -108,7 +108,6 @@ XNNPACK_SRCS = [
     "src/configs/conv-hwc2chw-config.c",
     "src/configs/dwconv-config.c",
     "src/configs/dwconv2d-chw-config.c",
-    "src/configs/gavgpool-config.c",
     "src/configs/gemm-config.c",
     "src/configs/ibilinear-chw-config.c",
     "src/configs/ibilinear-config.c",
diff --git a/cmake/gen/f16c_microkernels.cmake b/cmake/gen/f16c_microkernels.cmake
index 1d48cc491de..b21994aac03 100644
--- a/cmake/gen/f16c_microkernels.cmake
+++ b/cmake/gen/f16c_microkernels.cmake
@@ -15,8 +15,6 @@ SET(PROD_F16C_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c
   src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c
   src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c
   src/f16-rminmax/f16-rmax-f16c-u32.c
   src/f16-vbinary/gen/f16-vadd-f16c-u16.c
@@ -60,12 +58,6 @@ SET(NON_PROD_F16C_MICROKERNEL_SRCS
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc2.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c
   src/f16-vbinary/gen/f16-vadd-f16c-u8.c
   src/f16-vbinary/gen/f16-vaddc-f16c-u8.c
   src/f16-vbinary/gen/f16-vdiv-f16c-u16.c
diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake
index a9a6d6a7ae3..d946008330e 100644
--- a/cmake/gen/neon_microkernels.cmake
+++ b/cmake/gen/neon_microkernels.cmake
@@ -27,8 +27,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4.c
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4.c
   src/f32-f16-vcvt/gen/f32-f16-vcvt-neon-u8.c
-  src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c
-  src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c
   src/f32-gemm/gen/f32-gemm-1x8-minmax-neon-lane-ld64.c
   src/f32-gemm/gen/f32-gemm-4x2-minmax-neon-lane-ld64.c
   src/f32-gemm/gen/f32-gemm-4x8-minmax-neon-lane-ld128.c
@@ -100,8 +98,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mla8-ld64.c
   src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mla8-ld64.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-neon-mla8-ld128.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p16c-minmax-fp32-neon-mla8-ld64.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8c-minmax-fp32-neon-mla8-ld64.c
@@ -125,8 +121,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-rndnu-neon-mul8.c
   src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-rndnu-neon-mul8.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c
   src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/qu8-gemm-1x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/qu8-gemm-3x8-minmax-rndnu-neon-mlal-lane.c
@@ -571,20 +565,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u8.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u16.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-neon-mla8-ld64.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-neon-mla8-ld64.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p8c-minmax-fp32-neon-mla8-ld64.c
@@ -785,20 +765,6 @@ SET(NON_PROD_NEON_MICROKERNEL_SRCS
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u8.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u16.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c
   src/qu8-gemm/gen/qu8-gemm-1x8-minmax-fp32-neon-mlal-lane.c
   src/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neon-mlal-lane.c
   src/qu8-gemm/gen/qu8-gemm-2x8-minmax-rndnu-neon-mlal-lane.c
diff --git a/cmake/gen/neonfp16arith_microkernels.cmake b/cmake/gen/neonfp16arith_microkernels.cmake
index 62dbe4a7a37..e621ffa3d4b 100644
--- a/cmake/gen/neonfp16arith_microkernels.cmake
+++ b/cmake/gen/neonfp16arith_microkernels.cmake
@@ -24,8 +24,6 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS
   src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8.c
   src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c
   src/f16-gemm/gen/f16-gemm-1x8-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/f16-gemm-1x16-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/f16-gemm-6x8-minmax-neonfp16arith-ld64.c
@@ -170,12 +168,6 @@ SET(NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u16-acc2.c
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u24-acc3.c
   src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc2.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c
-  src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c
-  src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c
   src/f16-gemm/gen/f16-gemm-4x8-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/f16-gemm-4x16-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/f16-gemm-8x8-minmax-neonfp16arith-ld64.c
diff --git a/cmake/gen/neonv8_microkernels.cmake b/cmake/gen/neonv8_microkernels.cmake
index b82e6ca5f85..055300508d6 100644
--- a/cmake/gen/neonv8_microkernels.cmake
+++ b/cmake/gen/neonv8_microkernels.cmake
@@ -53,14 +53,6 @@ SET(NON_PROD_NEONV8_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c
   src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c
   src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-neonv8-mla8-ld64.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-neonv8-mla8-ld64.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-neonv8-mla8-ld64.c
@@ -200,14 +192,6 @@ SET(NON_PROD_NEONV8_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c
   src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c
   src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c
   src/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c
   src/qu8-gemm/gen/qu8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c
   src/qu8-igemm/gen/qu8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c
diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake
index 5d13ebb9fcd..602adea680e 100644
--- a/cmake/gen/rvv_microkernels.cmake
+++ b/cmake/gen/rvv_microkernels.cmake
@@ -15,8 +15,6 @@ SET(PROD_RVV_MICROKERNEL_SRCS
   src/f32-argmaxpool/f32-argmaxpool-9x-rvv-u1v.c
   src/f32-avgpool/gen/f32-avgpool-9p8x-minmax-rvv-c2v.c
   src/f32-avgpool/gen/f32-avgpool-9x-minmax-rvv-c2v.c
-  src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c
-  src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c
   src/f32-gemm/gen/f32-gemm-1x4v-minmax-rvv.c
   src/f32-gemm/gen/f32-gemm-7x4v-minmax-rvv.c
   src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c
@@ -73,10 +71,6 @@ SET(PROD_RVV_MICROKERNEL_SRCS
 SET(NON_PROD_RVV_MICROKERNEL_SRCS
   src/f32-avgpool/gen/f32-avgpool-9p8x-minmax-rvv-c1v.c
   src/f32-avgpool/gen/f32-avgpool-9x-minmax-rvv-c1v.c
-  src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c
-  src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c
-  src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c
-  src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c
   src/f32-gemm/gen/f32-gemm-1x4v-relu-rvv.c
   src/f32-gemm/gen/f32-gemm-1x4v-rvv.c
   src/f32-gemm/gen/f32-gemm-7x4v-relu-rvv.c
diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake
index 30082cad98f..9f8a6f6637c 100644
--- a/cmake/gen/scalar_microkernels.cmake
+++ b/cmake/gen/scalar_microkernels.cmake
@@ -40,8 +40,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1-acc2.c
   src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-u4.c
   src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-u2.c
-  src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c
-  src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c
   src/f32-gemm/gen/f32-gemm-1x4-minmax-scalar.c
   src/f32-gemm/gen/f32-gemm-1x4-relu-scalar.c
   src/f32-gemm/gen/f32-gemm-1x4-scalar.c
@@ -156,10 +154,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u1.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u4.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c
   src/qs8-packw/gen/qs8-packw-x64c4-gemm-goi-scalar.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c
@@ -201,10 +195,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u1.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u4.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c
   src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-imagic.c
   src/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-lrintf.c
   src/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-imagic.c
@@ -617,20 +607,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u2.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u3.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c
   src/qs8-packw/gen/qs8-packw-x8c4-gemm-goi-scalar.c
   src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-scalar.c
   src/qs8-packw/gen/qs8-packw-x16c4-gemm-goi-scalar.c
@@ -780,20 +756,6 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u2.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u3.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c
   src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-fmagic.c
   src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-lrintf.c
   src/qu8-gemm/gen/qu8-gemm-1x2-minmax-rndnu-scalar.c
diff --git a/cmake/gen/sse2_microkernels.cmake b/cmake/gen/sse2_microkernels.cmake
index 74c2269e8b7..1fe70ed3454 100644
--- a/cmake/gen/sse2_microkernels.cmake
+++ b/cmake/gen/sse2_microkernels.cmake
@@ -49,8 +49,6 @@ SET(PROD_SSE2_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse2-mul16-add16.c
   src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse2-mul16-add16.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-sse2-mul16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p8c-minmax-fp32-sse2-mul16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8c-minmax-fp32-sse2-mul16.c
@@ -70,8 +68,6 @@ SET(PROD_SSE2_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse2-mul16.c
   src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse2-mul16.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c
   src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c
   src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c
   src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse2-ld64.c
@@ -206,10 +202,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u8.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u16.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse2-mul16-add16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse2-mul16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l16c8s8r-minmax-fp32-sse2-mul16-add16.c
@@ -293,10 +285,6 @@ SET(NON_PROD_SSE2_MICROKERNEL_SRCS
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u8.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u16.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld64.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld128.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c
diff --git a/cmake/gen/sse41_microkernels.cmake b/cmake/gen/sse41_microkernels.cmake
index ed8c2b0ba25..b8c9f227a7b 100644
--- a/cmake/gen/sse41_microkernels.cmake
+++ b/cmake/gen/sse41_microkernels.cmake
@@ -32,8 +32,6 @@ SET(PROD_SSE41_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul16-add16.c
   src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul16-add16.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-sse41-mul16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p8c-minmax-fp32-sse41-mul16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8c-minmax-fp32-sse41-mul16.c
@@ -52,8 +50,6 @@ SET(PROD_SSE41_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse41-mul16.c
   src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse41-mul16.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c
   src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c
   src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c
   src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse41-ld64.c
@@ -202,10 +198,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u8.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u24.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c4s4r-minmax-fp32-sse41-mul32.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse41-mul16-add16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse41-mul16.c
@@ -322,10 +314,6 @@ SET(NON_PROD_SSE41_MICROKERNEL_SRCS
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u8.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u24.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld64.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld128.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c
diff --git a/cmake/gen/sse_microkernels.cmake b/cmake/gen/sse_microkernels.cmake
index b8777e138f4..51a20d61816 100644
--- a/cmake/gen/sse_microkernels.cmake
+++ b/cmake/gen/sse_microkernels.cmake
@@ -22,8 +22,6 @@ SET(PROD_SSE_MICROKERNEL_SRCS
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc3.c
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-4x4.c
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4.c
-  src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c
-  src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c
   src/f32-gemm/gen/f32-gemm-1x8-minmax-sse-load1.c
   src/f32-gemm/gen/f32-gemm-4x2c4-minmax-sse.c
   src/f32-gemm/gen/f32-gemm-4x8-minmax-sse-load1.c
diff --git a/cmake/gen/wasm_microkernels.cmake b/cmake/gen/wasm_microkernels.cmake
index f60129cee2c..f809939d3f5 100644
--- a/cmake/gen/wasm_microkernels.cmake
+++ b/cmake/gen/wasm_microkernels.cmake
@@ -17,8 +17,6 @@ SET(PROD_WASM_MICROKERNEL_SRCS
   src/f32-dwconv/gen/f32-dwconv-5f5m5l1c1s1r-minmax-wasm-acc2.c
   src/f32-dwconv/gen/f32-dwconv-9p1c-minmax-wasm-acc2.c
   src/f32-dwconv/gen/f32-dwconv-25p1c-minmax-wasm-acc2.c
-  src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c
-  src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c
   src/f32-gemm/gen/f32-gemm-1x4-minmax-wasm.c
   src/f32-gemm/gen/f32-gemm-1x4-relu-wasm.c
   src/f32-gemm/gen/f32-gemm-4x2-minmax-wasm.c
diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake
index b8d29cb8180..947bea7c842 100644
--- a/cmake/gen/wasmsimd_microkernels.cmake
+++ b/cmake/gen/wasmsimd_microkernels.cmake
@@ -48,10 +48,6 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc2.c
   src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc2.c
   src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmsimd-u24.c
-  src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c
-  src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c
-  src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c
-  src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c
   src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-arm-splat.c
   src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c
   src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-x86-splat.c
@@ -175,8 +171,6 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS
   src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16-add16.c
   src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16-add16.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-wasmsimd-mul16-add16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p16c-minmax-fp32-wasmsimd-mul16-add16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p16c-minmax-fp32-wasmsimd-mul16-add16.c
@@ -197,8 +191,6 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS
   src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16.c
   src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c
   src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c
   src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c
@@ -875,12 +867,6 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u8.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u16.c
   src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c
-  src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-wasmsimd-mul16-add16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-wasmsimd-mul16.c
   src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l16c8s8r-minmax-fp32-wasmsimd-mul16-add16.c
@@ -977,12 +963,6 @@ SET(NON_PROD_WASMSIMD_MICROKERNEL_SRCS
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u8.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u16.c
   src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c
-  src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c
   src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c
diff --git a/gen/f16c_microkernels.bzl b/gen/f16c_microkernels.bzl
index 5e2a145eb6c..9f11c329db1 100644
--- a/gen/f16c_microkernels.bzl
+++ b/gen/f16c_microkernels.bzl
@@ -11,8 +11,6 @@ PROD_F16C_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c",
    "src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-c32.c",
    "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c",
    "src/f16-maxpool/f16-maxpool-9p8x-minmax-f16c-c8.c",
    "src/f16-rminmax/f16-rmax-f16c-u32.c",
    "src/f16-vbinary/gen/f16-vadd-f16c-u16.c",
@@ -57,12 +55,6 @@ NON_PROD_F16C_MICROKERNEL_SRCS = [
    "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u16-acc2.c",
    "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u24-acc3.c",
    "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc2.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c",
-    "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c",
    "src/f16-vbinary/gen/f16-vadd-f16c-u8.c",
    "src/f16-vbinary/gen/f16-vaddc-f16c-u8.c",
    "src/f16-vbinary/gen/f16-vdiv-f16c-u16.c",
diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl
index 80e0b69fa88..dc9f8351c06 100644
--- a/gen/neon_microkernels.bzl
+++ b/gen/neon_microkernels.bzl
@@ -23,8 +23,6 @@ PROD_NEON_MICROKERNEL_SRCS = [
    "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-neon-1x4.c",
    "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-neon-1x4.c",
    "src/f32-f16-vcvt/gen/f32-f16-vcvt-neon-u8.c",
-    "src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c",
-    "src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c",
    "src/f32-gemm/gen/f32-gemm-1x8-minmax-neon-lane-ld64.c",
    "src/f32-gemm/gen/f32-gemm-4x2-minmax-neon-lane-ld64.c",
    "src/f32-gemm/gen/f32-gemm-4x8-minmax-neon-lane-ld128.c",
@@ -96,8 +94,6 @@ PROD_NEON_MICROKERNEL_SRCS = [
    "src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-rndnu-neon-mla8-ld64.c",
    "src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-rndnu-neon-mla8-ld64.c",
    "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u32.c",
-    "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c",
-    "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c",
    "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-neon-mla8-ld128.c",
    "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p16c-minmax-fp32-neon-mla8-ld64.c",
    "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8c-minmax-fp32-neon-mla8-ld64.c",
@@ -121,8 +117,6 @@ PROD_NEON_MICROKERNEL_SRCS = [
    "src/qu8-dwconv/gen/qu8-dwconv-9p16c-minmax-rndnu-neon-mul8.c",
"src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-rndnu-neon-mul8.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c", "src/qu8-gemm/gen/qu8-gemm-1x8-minmax-rndnu-neon-mlal-lane.c", "src/qu8-gemm/gen/qu8-gemm-1x16-minmax-rndnu-neon-mlal-lane.c", "src/qu8-gemm/gen/qu8-gemm-3x8-minmax-rndnu-neon-mlal-lane.c", @@ -568,20 +562,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u8.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u16.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-neon-u24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-neon-mla8-ld64.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-neon-mla8-ld64.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p8c-minmax-fp32-neon-mla8-ld64.c", @@ -782,20 +762,6 @@ NON_PROD_NEON_MICROKERNEL_SRCS = [ "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u8.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u16.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-neon-u24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c", "src/qu8-gemm/gen/qu8-gemm-1x8-minmax-fp32-neon-mlal-lane.c", "src/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neon-mlal-lane.c", "src/qu8-gemm/gen/qu8-gemm-2x8-minmax-rndnu-neon-mlal-lane.c", diff --git a/gen/neonfp16arith_microkernels.bzl b/gen/neonfp16arith_microkernels.bzl index 3477376ceb5..2225525fbe8 100644 --- a/gen/neonfp16arith_microkernels.bzl +++ b/gen/neonfp16arith_microkernels.bzl @@ -20,8 +20,6 @@ PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8.c", "src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-c16.c", 
"src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c", - "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c", - "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c", "src/f16-gemm/gen/f16-gemm-1x8-minmax-neonfp16arith-ld64.c", "src/f16-gemm/gen/f16-gemm-1x16-minmax-neonfp16arith-ld64.c", "src/f16-gemm/gen/f16-gemm-6x8-minmax-neonfp16arith-ld64.c", @@ -167,12 +165,6 @@ NON_PROD_NEONFP16ARITH_MICROKERNEL_SRCS = [ "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u16-acc2.c", "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u24-acc3.c", "src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc2.c", - "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c", - "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c", - "src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c", - "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c", - "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c", - "src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c", "src/f16-gemm/gen/f16-gemm-4x8-minmax-neonfp16arith-ld64.c", "src/f16-gemm/gen/f16-gemm-4x16-minmax-neonfp16arith-ld64.c", "src/f16-gemm/gen/f16-gemm-8x8-minmax-neonfp16arith-ld64.c", diff --git a/gen/neonv8_microkernels.bzl b/gen/neonv8_microkernels.bzl index ee23ef3c77f..4adca66c48e 100644 --- a/gen/neonv8_microkernels.bzl +++ b/gen/neonv8_microkernels.bzl @@ -50,14 +50,6 @@ NON_PROD_NEONV8_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c", "src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c", "src/qs8-dwconv/gen/qs8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-neonv8-mla8-ld64.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-neonv8-mla8-ld64.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-neonv8-mla8-ld64.c", @@ -197,14 +189,6 @@ NON_PROD_NEONV8_MICROKERNEL_SRCS = [ "src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-neonv8-mul16.c", "src/qu8-dwconv/gen/qu8-dwconv-25p16c-minmax-fp32-neonv8-mul16.c", "src/qu8-dwconv/gen/qu8-dwconv-25p32c-minmax-fp32-neonv8-mul16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c", "src/qu8-gemm/gen/qu8-gemm-1x16-minmax-fp32-neonv8-mlal-lane.c", "src/qu8-gemm/gen/qu8-gemm-4x16-minmax-fp32-neonv8-mlal-lane.c", "src/qu8-igemm/gen/qu8-igemm-1x16-minmax-fp32-neonv8-mlal-lane.c", diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 
8790b58a8e4..8d86489dbb2 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -11,8 +11,6 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-argmaxpool/f32-argmaxpool-9x-rvv-u1v.c", "src/f32-avgpool/gen/f32-avgpool-9p8x-minmax-rvv-c2v.c", "src/f32-avgpool/gen/f32-avgpool-9x-minmax-rvv-c2v.c", - "src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c", - "src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c", "src/f32-gemm/gen/f32-gemm-1x4v-minmax-rvv.c", "src/f32-gemm/gen/f32-gemm-7x4v-minmax-rvv.c", "src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c", @@ -70,10 +68,6 @@ PROD_RVV_MICROKERNEL_SRCS = [ NON_PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-avgpool/gen/f32-avgpool-9p8x-minmax-rvv-c1v.c", "src/f32-avgpool/gen/f32-avgpool-9x-minmax-rvv-c1v.c", - "src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c", - "src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c", - "src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c", - "src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c", "src/f32-gemm/gen/f32-gemm-1x4v-relu-rvv.c", "src/f32-gemm/gen/f32-gemm-1x4v-rvv.c", "src/f32-gemm/gen/f32-gemm-7x4v-relu-rvv.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index dc83dabaae6..cb01215b8cd 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -36,8 +36,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-scalar-2x1-acc2.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-bitcast-u4.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-scalar-fabsf-u2.c", - "src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c", - "src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c", "src/f32-gemm/gen/f32-gemm-1x4-minmax-scalar.c", "src/f32-gemm/gen/f32-gemm-1x4-relu-scalar.c", "src/f32-gemm/gen/f32-gemm-1x4-scalar.c", @@ -152,10 +150,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u1.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u4.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c", "src/qs8-packw/gen/qs8-packw-x64c4-gemm-goi-scalar.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p1c-minmax-fp32-scalar-fmagic.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p2c-minmax-fp32-scalar-imagic.c", @@ -197,10 +191,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-dwconv/gen/qu8-dwconv-25p2c-minmax-fp32-scalar-lrintf.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u1.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u4.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c", "src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-imagic.c", "src/qu8-gemm/gen/qu8-gemm-1x4-minmax-fp32-scalar-lrintf.c", "src/qu8-gemm/gen/qu8-gemm-2x2-minmax-fp32-scalar-imagic.c", @@ -614,20 +604,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u2.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u3.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c", - 
"src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c", "src/qs8-packw/gen/qs8-packw-x8c4-gemm-goi-scalar.c", "src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-scalar.c", "src/qs8-packw/gen/qs8-packw-x16c4-gemm-goi-scalar.c", @@ -777,20 +753,6 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qu8-dwconv/gen/qu8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u2.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-scalar-u3.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c", "src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-fmagic.c", "src/qu8-gemm/gen/qu8-gemm-1x2-minmax-fp32-scalar-lrintf.c", "src/qu8-gemm/gen/qu8-gemm-1x2-minmax-rndnu-scalar.c", diff --git a/gen/sse2_microkernels.bzl b/gen/sse2_microkernels.bzl index 74279c53b4a..34f5a067f4a 100644 --- a/gen/sse2_microkernels.bzl +++ b/gen/sse2_microkernels.bzl @@ -45,8 +45,6 @@ PROD_SSE2_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse2-mul16-add16.c", "src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse2-mul16-add16.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-sse2-mul16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p8c-minmax-fp32-sse2-mul16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8c-minmax-fp32-sse2-mul16.c", @@ -66,8 +64,6 @@ PROD_SSE2_MICROKERNEL_SRCS = [ "src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse2-mul16.c", "src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse2-mul16.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u32.c", - 
"src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c", "src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse2-ld64.c", "src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse2-ld64.c", "src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse2-ld64.c", @@ -203,10 +199,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u8.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u16.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse2-u24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse2-mul16-add16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse2-mul16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l16c8s8r-minmax-fp32-sse2-mul16-add16.c", @@ -290,10 +282,6 @@ NON_PROD_SSE2_MICROKERNEL_SRCS = [ "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u8.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u16.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse2-u24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld64.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse2-ld128.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse2-ld64.c", diff --git a/gen/sse41_microkernels.bzl b/gen/sse41_microkernels.bzl index 623f97f31c9..03e207f8848 100644 --- a/gen/sse41_microkernels.bzl +++ b/gen/sse41_microkernels.bzl @@ -28,8 +28,6 @@ PROD_SSE41_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-9p8c-minmax-fp32-sse41-mul16-add16.c", "src/qs8-dwconv/gen/qs8-dwconv-25p8c-minmax-fp32-sse41-mul16-add16.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8c-minmax-fp32-sse41-mul16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p8c-minmax-fp32-sse41-mul16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8c-minmax-fp32-sse41-mul16.c", @@ -48,8 +46,6 @@ PROD_SSE41_MICROKERNEL_SRCS = [ "src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-sse41-mul16.c", "src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-sse41-mul16.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c", "src/qu8-gemm/gen/qu8-gemm-1x4c8-minmax-fp32-sse41-ld64.c", "src/qu8-gemm/gen/qu8-gemm-3x4c8-minmax-fp32-sse41-ld64.c", "src/qu8-igemm/gen/qu8-igemm-1x4c8-minmax-fp32-sse41-ld64.c", @@ -199,10 +195,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u8.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u24.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-sse41-u32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c4s4r-minmax-fp32-sse41-mul32.c", 
"src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse41-mul16-add16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-sse41-mul16.c", @@ -319,10 +311,6 @@ NON_PROD_SSE41_MICROKERNEL_SRCS = [ "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u8.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u24.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-sse41-u32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld64.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-sse41-ld128.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-sse41-ld64.c", diff --git a/gen/sse_microkernels.bzl b/gen/sse_microkernels.bzl index d27e5296e19..87f8eec2de0 100644 --- a/gen/sse_microkernels.bzl +++ b/gen/sse_microkernels.bzl @@ -18,8 +18,6 @@ PROD_SSE_MICROKERNEL_SRCS = [ "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-3x3s2p1-minmax-sse-1x4-acc3.c", "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5p2-minmax-sse-4x4.c", "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-sse-2x4.c", - "src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c", - "src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c", "src/f32-gemm/gen/f32-gemm-1x8-minmax-sse-load1.c", "src/f32-gemm/gen/f32-gemm-4x2c4-minmax-sse.c", "src/f32-gemm/gen/f32-gemm-4x8-minmax-sse-load1.c", diff --git a/gen/wasm_microkernels.bzl b/gen/wasm_microkernels.bzl index a5f1b10aeae..c5061be9acf 100644 --- a/gen/wasm_microkernels.bzl +++ b/gen/wasm_microkernels.bzl @@ -13,8 +13,6 @@ PROD_WASM_MICROKERNEL_SRCS = [ "src/f32-dwconv/gen/f32-dwconv-5f5m5l1c1s1r-minmax-wasm-acc2.c", "src/f32-dwconv/gen/f32-dwconv-9p1c-minmax-wasm-acc2.c", "src/f32-dwconv/gen/f32-dwconv-25p1c-minmax-wasm-acc2.c", - "src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c", - "src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c", "src/f32-gemm/gen/f32-gemm-1x4-minmax-wasm.c", "src/f32-gemm/gen/f32-gemm-1x4-relu-wasm.c", "src/f32-gemm/gen/f32-gemm-4x2-minmax-wasm.c", diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index 39965b5bb9e..dd60c1e931b 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -44,10 +44,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc2.c", "src/f32-dwconv2d-chw/gen/f32-dwconv2d-chw-5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc2.c", "src/f32-f16-vcvt/gen/f32-f16-vcvt-wasmsimd-u24.c", - "src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c", - "src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c", - "src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c", - "src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c", "src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-arm-splat.c", "src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-x86-loadsplat.c", "src/f32-gemm/gen/f32-gemm-1x8-minmax-wasmsimd-x86-splat.c", @@ -171,8 +167,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-9p16c-minmax-fp32-wasmsimd-mul16-add16.c", "src/qs8-dwconv/gen/qs8-dwconv-25p16c-minmax-fp32-wasmsimd-mul16-add16.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p16c-minmax-fp32-wasmsimd-mul16-add16.c", 
"src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p16c-minmax-fp32-wasmsimd-mul16-add16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p16c-minmax-fp32-wasmsimd-mul16-add16.c", @@ -193,8 +187,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/qu8-dwconv/gen/qu8-dwconv-9p8c-minmax-fp32-wasmsimd-mul16.c", "src/qu8-dwconv/gen/qu8-dwconv-25p8c-minmax-fp32-wasmsimd-mul16.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c", "src/qu8-gemm/gen/qu8-gemm-4x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c", "src/qu8-igemm/gen/qu8-igemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld128.c", @@ -872,12 +864,6 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u8.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u16.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-wasmsimd-u24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c", - "src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-wasmsimd-mul16-add16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l8c8s8r-minmax-fp32-wasmsimd-mul16.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l16c8s8r-minmax-fp32-wasmsimd-mul16-add16.c", @@ -974,12 +960,6 @@ NON_PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u8.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u16.c", "src/qu8-f32-vcvt/gen/qu8-f32-vcvt-wasmsimd-u24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c", - "src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c", "src/qu8-gemm/gen/qu8-gemm-1x4c2s4-minmax-fp32-wasmsimd-dot16x2-ld64.c", diff --git a/scripts/generate-f16-gavgpool.sh b/scripts/generate-f16-gavgpool.sh deleted file mode 100755 index 5b9a1ed39f0..00000000000 --- a/scripts/generate-f16-gavgpool.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/sh -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
-
-################################## ARM NEON ###################################
-tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c &
-tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c &
-tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c &
-tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c &
-
-tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c &
-tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c &
-tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c &
-tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c &
-
-################################### x86 F16C ###################################
-tools/xngen src/f16-gavgpool/unipass-f16c.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c &
-tools/xngen src/f16-gavgpool/unipass-f16c.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c &
-tools/xngen src/f16-gavgpool/unipass-f16c.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c &
-tools/xngen src/f16-gavgpool/unipass-f16c.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c &
-
-tools/xngen src/f16-gavgpool/multipass-f16c.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c &
-tools/xngen src/f16-gavgpool/multipass-f16c.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c &
-tools/xngen src/f16-gavgpool/multipass-f16c.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c &
-tools/xngen src/f16-gavgpool/multipass-f16c.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c &
-
-wait
diff --git a/scripts/generate-f32-gavgpool.sh b/scripts/generate-f32-gavgpool.sh
deleted file mode 100755
index f3cb9b2d0d3..00000000000
--- a/scripts/generate-f32-gavgpool.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/bin/sh
-# Copyright 2024 Imagination Technologies, inc.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-################################ RISC-V Vector ################################
-tools/xngen src/f32-gavgpool/rvv_7x.c.in -D LMUL=1 -o src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c &
-tools/xngen src/f32-gavgpool/rvv_7x.c.in -D LMUL=2 -o src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c &
-tools/xngen src/f32-gavgpool/rvv_7x.c.in -D LMUL=4 -o src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c &
-
-tools/xngen src/f32-gavgpool/rvv_7p7x.c.in -D LMUL=1 -o src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c &
-tools/xngen src/f32-gavgpool/rvv_7p7x.c.in -D LMUL=2 -o src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c &
-tools/xngen src/f32-gavgpool/rvv_7p7x.c.in -D LMUL=4 -o src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c &
-
-wait
diff --git a/scripts/generate-qs8-gavgpool.sh b/scripts/generate-qs8-gavgpool.sh
deleted file mode 100755
index 80a918c1621..00000000000
--- a/scripts/generate-qs8-gavgpool.sh
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/bin/sh
-# Copyright 2020 Google LLC
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-################################### Scalar ####################################
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c &
-
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c &
-tools/xngen src/qs8-gavgpool/unipass-scalar.c.in -D ROW_TILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c &
-
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QS8 -D WASM=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c &
-
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=FMAGIC -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c &
-
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=1 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=2 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c &
-tools/xngen src/qs8-gavgpool/multipass-scalar.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=4 -D REQUANTIZATION=FP32 -D VARIANT=LRINTF -D DATATYPE=QU8 -D WASM=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c &
-
-################################## ARM NEON ###################################
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c &
-
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c &
-
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-neon.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=1 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=RNDNU -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=1 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-neon.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=RNDNU -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c &
-
-################################## WAsm SIMD ##################################
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c &
-
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c &
-tools/xngen src/qs8-gavgpool/unipass-wasmsimd.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c &
-
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c &
-tools/xngen src/qs8-gavgpool/multipass-wasmsimd.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c &
-
-################################### x86 SSE ###################################
-tools/xngen src/qs8-gavgpool/unipass-sse2.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-sse2.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-sse2.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c &
-
-tools/xngen src/qs8-gavgpool/unipass-sse4.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-sse4.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-sse4.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c &
-
-tools/xngen src/qs8-gavgpool/unipass-sse2.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-sse2.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-sse2.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c &
-
-tools/xngen src/qs8-gavgpool/unipass-sse4.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c &
-tools/xngen src/qs8-gavgpool/unipass-sse4.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c &
-tools/xngen src/qs8-gavgpool/unipass-sse4.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c &
-
-tools/xngen src/qs8-gavgpool/multipass-sse2.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-sse2.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-sse2.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c &
-
-tools/xngen src/qs8-gavgpool/multipass-sse4.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-sse4.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-sse4.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -o src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c &
-
-tools/xngen src/qs8-gavgpool/multipass-sse2.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-sse2.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-sse2.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c &
-
-tools/xngen src/qs8-gavgpool/multipass-sse4.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c &
-tools/xngen src/qs8-gavgpool/multipass-sse4.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c &
-tools/xngen src/qs8-gavgpool/multipass-sse4.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -o src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c &
-
-wait
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
index 3ffc01f4221..de5302fdba6 100755
--- a/scripts/generate-tests.sh
+++ b/scripts/generate-tests.sh
@@ -4,14 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-### Tests for GAvgPool micro-kernels
-tools/generate-gavgpool-test.py --spec test/f16-gavgpool-minmax.yaml --output test/f16-gavgpool-minmax.cc &
-tools/generate-gavgpool-test.py --spec test/f32-gavgpool-minmax.yaml --output test/f32-gavgpool-minmax.cc &
-tools/generate-gavgpool-test.py --spec test/qs8-gavgpool-minmax-fp32.yaml --output test/qs8-gavgpool-minmax-fp32.cc &
-tools/generate-gavgpool-test.py --spec test/qs8-gavgpool-minmax-rndnu.yaml --output test/qs8-gavgpool-minmax-rndnu.cc &
-tools/generate-gavgpool-test.py --spec test/qu8-gavgpool-minmax-fp32.yaml --output test/qu8-gavgpool-minmax-fp32.cc &
-tools/generate-gavgpool-test.py --spec test/qu8-gavgpool-minmax-rndnu.yaml --output test/qu8-gavgpool-minmax-rndnu.cc &
-
 ### Tests for ArgMaxPool micro-kernels
 tools/generate-argmaxpool-test.py --spec test/f32-argmaxpool.yaml --output test/f32-argmaxpool.cc &
diff --git a/src/configs/gavgpool-config.c b/src/configs/gavgpool-config.c
deleted file mode 100644
index 1a0495748c6..00000000000
--- a/src/configs/gavgpool-config.c
+++ /dev/null
@@ -1,308 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <stddef.h>
-
-#include "xnnpack/common.h"
-#include "xnnpack/config.h"
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/init-once.h"
-#include "xnnpack/microfnptr.h"
-#include "xnnpack/microparams-init.h"
-
-static struct xnn_gavgpool_config f16_gavgpool_config = {0};
-static struct xnn_gavgpool_config f32_gavgpool_config = {0};
-static struct xnn_gavgpool_config qs8_gavgpool_config = {0};
-static struct xnn_gavgpool_config qu8_gavgpool_config = {0};
-
-XNN_INIT_ONCE_GUARD(f16_gavgpool);
-XNN_INIT_ONCE_GUARD(f32_gavgpool);
-XNN_INIT_ONCE_GUARD(qs8_gavgpool);
-XNN_INIT_ONCE_GUARD(qu8_gavgpool);
-
-static void init_f16_gavgpool_config(void) {
-  #if XNN_ARCH_ARM && XNN_ENABLE_ARM_FP16_VECTOR && XNN_ENABLE_ARM_FP16_SCALAR
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon_fp16_arith) {
-      f16_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8;
-      f16_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8;
-      f16_gavgpool_config.init.f16 = xnn_init_f16_scaleminmax_scalar_params;
-      f16_gavgpool_config.update.f16 = xnn_update_f16_scaleminmax_scalar_params;
-      f16_gavgpool_config.row_tile = 7;
-      f16_gavgpool_config.channel_tile = 8;
-    }
-  #elif XNN_ARCH_ARM64 && XNN_ENABLE_ARM_FP16_VECTOR
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon_fp16_arith) {
-      f16_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8;
-      f16_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8;
-      f16_gavgpool_config.init.f16 = xnn_init_f16_scaleminmax_scalar_params;
-      f16_gavgpool_config.update.f16 = xnn_update_f16_scaleminmax_scalar_params;
-      f16_gavgpool_config.row_tile = 7;
-      f16_gavgpool_config.channel_tile = 8;
-    }
-  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_x86_f16c) {
-      f16_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8;
-      f16_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8;
-      f16_gavgpool_config.init.f16 = xnn_init_f16_scaleminmax_scalar_params;
-      f16_gavgpool_config.update.f16 = xnn_update_f16_scaleminmax_scalar_params;
-      f16_gavgpool_config.row_tile = 7;
-      f16_gavgpool_config.channel_tile = 8;
-    }
-  #endif
-}
-
-static void init_f32_gavgpool_config(void) {
-  #if XNN_ARCH_ARM
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon) {
-      f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4;
-      f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4;
-      f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.row_tile = 7;
-      f32_gavgpool_config.channel_tile = 4;
-    } else if (!XNN_PLATFORM_MOBILE) {
-      f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1;
-      f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1;
-      f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.row_tile = 7;
-      f32_gavgpool_config.channel_tile = 1;
-    }
-  #elif XNN_ARCH_ARM64
-    f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4;
-    f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4;
-    f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.row_tile = 7;
-    f32_gavgpool_config.channel_tile = 4;
-  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
-    f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4;
-    f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4;
-    f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.row_tile = 7;
-    f32_gavgpool_config.channel_tile = 4;
-  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->is_x86) {
-      f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4;
-      f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4;
-      f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.row_tile = 7;
-      f32_gavgpool_config.channel_tile = 4;
-    } else {
-      f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4;
-      f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4;
-      f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-      f32_gavgpool_config.row_tile = 7;
-      f32_gavgpool_config.channel_tile = 4;
-    }
-  #elif XNN_ARCH_WASM
-    f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1;
-    f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1;
-    f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.row_tile = 7;
-    f32_gavgpool_config.channel_tile = 1;
-  #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR
-    f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v;
-    f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v;
-    f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.row_tile = 7;
-    f32_gavgpool_config.channel_tile = 16;
-  #else
-    f32_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1;
-    f32_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1;
-    f32_gavgpool_config.init.f32 = xnn_init_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.update.f32 = xnn_update_f32_scaleminmax_scalar_params;
-    f32_gavgpool_config.row_tile = 7;
-    f32_gavgpool_config.channel_tile = 1;
-  #endif
-}
-
-static void init_qs8_gavgpool_config(void) {
-  #if XNN_ARCH_ARM
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon) {
-      qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8;
-      qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8;
-      qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params;
-      qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params;
-      qs8_gavgpool_config.row_tile = 7;
-      qs8_gavgpool_config.channel_tile = 8;
-    } else if (!XNN_PLATFORM_MOBILE) {
-      qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1;
-      qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1;
-      qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params;
-      qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params;
-      qs8_gavgpool_config.row_tile = 7;
-      qs8_gavgpool_config.channel_tile = 1;
-    }
-  #elif XNN_ARCH_ARM64
-    qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8;
-    qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8;
-    qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params;
-    qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params;
-    qs8_gavgpool_config.row_tile = 7;
-    qs8_gavgpool_config.channel_tile = 8;
-  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_x86_sse4_1) {
-      qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8;
-      qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8;
-      qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params;
-      qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params;
-      qs8_gavgpool_config.row_tile = 7;
-      qs8_gavgpool_config.channel_tile = 8;
-    } else {
-      qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8;
-      qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8;
-      qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params;
-      qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params;
-      qs8_gavgpool_config.row_tile = 7;
-      qs8_gavgpool_config.channel_tile = 8;
-    }
-  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-    qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16;
-    qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16;
-    qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params;
-    qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params;
-    qs8_gavgpool_config.row_tile = 7;
-    qs8_gavgpool_config.channel_tile = 16;
-  #elif XNN_ARCH_WASM
-    qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4;
-    qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4;
-    qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params;
-    qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params;
-    qs8_gavgpool_config.row_tile = 7;
-    qs8_gavgpool_config.channel_tile = 4;
-  #else
-    qs8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1;
-    qs8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1;
-    qs8_gavgpool_config.init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params;
-    qs8_gavgpool_config.update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params;
-    qs8_gavgpool_config.row_tile = 7;
-    qs8_gavgpool_config.channel_tile = 1;
-  #endif
-}
-
-static void init_qu8_gavgpool_config(void) {
-  #if XNN_ARCH_ARM
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon) {
-      qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8;
-      qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8;
-      qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params;
-      qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params;
-      qu8_gavgpool_config.row_tile = 7;
-      qu8_gavgpool_config.channel_tile = 8;
-    } else if (!XNN_PLATFORM_MOBILE) {
-      qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1;
-
qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params; - qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 1; - } - #elif XNN_ARCH_ARM64 - qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8; - qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params; - qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 8; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_x86_sse4_1) { - qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8; - qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params; - qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 8; - } else { - qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8; - qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params; - qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 8; - } - #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16; - qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params; - qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 16; - #elif XNN_ARCH_WASM - qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4; - qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params; - qu8_gavgpool_config.update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 4; - #else - qu8_gavgpool_config.unipass = (xnn_gavgpool_unipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1; - qu8_gavgpool_config.multipass = (xnn_gavgpool_multipass_ukernel_fn) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1; - qu8_gavgpool_config.init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params; - qu8_gavgpool_config.update.qu8 = 
xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params; - qu8_gavgpool_config.row_tile = 7; - qu8_gavgpool_config.channel_tile = 1; - #endif -} - -const struct xnn_gavgpool_config* xnn_init_f16_gavgpool_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL || !xnn_is_f16_compatible_config(hardware_config)) { - return NULL; - } - XNN_INIT_ONCE(f16_gavgpool); - return &f16_gavgpool_config; -} - -const struct xnn_gavgpool_config* xnn_init_f32_gavgpool_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(f32_gavgpool); - return &f32_gavgpool_config; -} - -const struct xnn_gavgpool_config* xnn_init_qs8_gavgpool_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(qs8_gavgpool); - return &qs8_gavgpool_config; -} - -const struct xnn_gavgpool_config* xnn_init_qu8_gavgpool_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(qu8_gavgpool); - return &qu8_gavgpool_config; -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c deleted file mode 100644 index 6fdbcb92c95..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c16.c +++ /dev/null @@ -1,300 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/multipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <immintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 16; c -= 16) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 +=
8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 16; c -= 16) { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b); - __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) (b + 8)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b); - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 
vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= 16; channels -= 16) { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 =
_mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) ((uint16_t*) o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - o += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - 
vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c deleted file mode 100644 index f454e39c56d..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c24.c +++ /dev/null @@ -1,349 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/multipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <immintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc01234567 =
_mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(vi0xGHIJKLMN, vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 
8; - _mm_store_si128((__m128i*) b, vaccGHIJKLMN); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b); - __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) (b + 8)); - __m128i vaccGHIJKLMN = _mm_loadu_si128((const __m128i*) (b + 16)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi0xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const 
__m128i*) i2)); i2 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8; - _mm_store_si128((__m128i*) b, vaccGHIJKLMN); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - __m128i vacc01234567 = 
_mm_loadu_si128((const __m128i*) b); - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= 24; channels -= 24) { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - __m128i vaccGHIJKLMN = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - -
const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi0xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = 
_mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin); - __m256 voutGHIJKLMN = _mm256_max_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax); - voutGHIJKLMN = _mm256_min_ps(voutGHIJKLMN, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) ((uint16_t*) o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) ((uint16_t*) o + 16), _mm256_cvtps_ph(voutGHIJKLMN, _MM_FROUND_TO_NEAREST_INT)); - o += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = 
_mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c deleted file mode 100644 index 52155651e88..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c32.c +++ /dev/null @@ -1,398 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/multipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <immintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi1xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc89ABCDEF =
_mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(vi0xGHIJKLMN, vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(vi0xOPQRSTUV, vi1xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi2xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi3xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi4xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi5xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi6xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8; - _mm_store_si128((__m128i*) b, vaccGHIJKLMN); b += 8; - _mm_store_si128((__m128i*) b, vaccOPQRSTUV); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b); - __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) (b + 8)); - __m128i vaccGHIJKLMN = _mm_loadu_si128((const __m128i*) (b + 16)); - __m128i vaccOPQRSTUV = _mm_loadu_si128((const __m128i*) (b + 24)); - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const 
__m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi0xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi0xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi1xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi0xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi1xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi2xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xOPQRSTUV = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi3xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi4xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi5xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi6xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc01234567); b += 8; - _mm_store_si128((__m128i*) b, vacc89ABCDEF); b += 8; - _mm_store_si128((__m128i*) b, vaccGHIJKLMN); b += 8; - _mm_store_si128((__m128i*) b, vaccOPQRSTUV); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b); - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - 
vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT);
-        const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
-        vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT);
-        const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
-        vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT);
-        vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT);
-
-        _mm_store_si128((__m128i*) b, vacc01234567);
-        b += 8;
-
-        c = doz(c, 8);
-      } while (c != 0);
-    }
-  }
-
-  i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
-  i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = (const uint16_t*) zero;
-  }
-  i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = (const uint16_t*) zero;
-  }
-  i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = (const uint16_t*) zero;
-  }
-  i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = (const uint16_t*) zero;
-  }
-  i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = (const uint16_t*) zero;
-  }
-  i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = (const uint16_t*) zero;
-  }
-  uint16_t* o = (uint16_t*) output;
-
-  const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.scale));
-  const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.min));
-  const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.max));
-  XNN_FORCE_REALIZATION(vscale);
-  XNN_FORCE_REALIZATION(vmin);
-  XNN_FORCE_REALIZATION(vmax);
-  for (; channels >= 32; channels -= 32) {
-    __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8;
-    __m128i vacc89ABCDEF = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8;
-    __m128i vaccGHIJKLMN = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8;
-    __m128i vaccOPQRSTUV = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8;
-
-    const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-    const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-    const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-    const __m256 vi0xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-
-    const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
-    vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi0x89ABCDEF), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
-    vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi0xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi1xOPQRSTUV =
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi0xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi1xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi2xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi3xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vaccOPQRSTUV = 
_mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi4xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi5xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi6xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vscale), _MM_FROUND_TO_NEAREST_INT); - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin); - __m256 voutGHIJKLMN = _mm256_max_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vmin); - __m256 voutOPQRSTUV = _mm256_max_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax); - voutGHIJKLMN = _mm256_min_ps(voutGHIJKLMN, vmax); - voutOPQRSTUV = _mm256_min_ps(voutOPQRSTUV, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) ((uint16_t*) o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) ((uint16_t*) o + 16), _mm256_cvtps_ph(voutGHIJKLMN, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) ((uint16_t*) o + 24), _mm256_cvtps_ph(voutOPQRSTUV, _MM_FROUND_TO_NEAREST_INT)); - o += 32; - } - if XNN_UNLIKELY(channels != 0) { - do { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 
vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c deleted file mode 100644 index ea1f88bea2d..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-f16c-c8.c +++ /dev/null @@ -1,197 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/multipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/intrinsics-polyfill.h"
-#include "xnnpack/math.h"
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
-    size_t rows,
-    size_t channels,
-    const xnn_float16* input,
-    size_t input_stride,
-    const xnn_float16* zero,
-    xnn_float16* buffer,
-    xnn_float16* output,
-    const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-  const uint16_t* i0 = (const uint16_t*) input;
-  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
-  const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
-  const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
-  const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
-  const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
-  const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
-
-  uint16_t* b = (uint16_t*) buffer;
-  size_t c = channels;
-  for (; c != 0; c = doz(c, 8)) {
-    const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-    const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
-
-    const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
-    __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT);
-
-    const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT);
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT);
-
-    _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
-  }
-
-  for (rows -= 7; rows > 7; rows -= 7) {
-    i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
-    i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
-    i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
-    i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
-    i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
-    i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
-    i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
-
-    uint16_t* b = (uint16_t*) buffer;
-    size_t c = channels;
-    for (; c != 0; c = doz(c, 8)) {
-      __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
-
-      const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-
-      const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT);
-      const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT);
-      const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT);
-      const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT);
-      const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT);
-      const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT);
-      vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT);
-
-      _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
-    }
-  }
-
-  i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
-  i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = (const uint16_t*) zero;
-  }
-  i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = (const uint16_t*) zero;
-  }
-  i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = (const uint16_t*) zero;
-  }
-  i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = (const uint16_t*) zero;
-  }
-  i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = (const uint16_t*) zero;
-  }
-  i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = (const uint16_t*) zero;
-  }
-  uint16_t* o = (uint16_t*) output;
-
-  const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.scale));
-  const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.min));
-  const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.max));
-  XNN_FORCE_REALIZATION(vscale);
-  XNN_FORCE_REALIZATION(vmin);
-  XNN_FORCE_REALIZATION(vmax);
-  for (; channels >= 8; channels -= 8) {
-    __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8;
-
-    const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
-
-    const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
-    vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT);
-    const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
-    vacc01234567 =
_mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - } - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c deleted file mode 100644 index cb8c83b5d4b..00000000000 --- 
a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c16.c
+++ /dev/null
@@ -1,290 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16(
-    size_t rows,
-    size_t channels,
-    const xnn_float16* input,
-    size_t input_stride,
-    const xnn_float16* zero,
-    xnn_float16* buffer,
-    xnn_float16* output,
-    const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-  const uint16_t* i0 = (const uint16_t*) input;
-  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
-  const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
-  const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
-  const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
-  const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
-  const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
-
-  uint16_t* b = (uint16_t*) buffer;
-  size_t c = channels;
-  for (; c >= 16; c -= 16) {
-    const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-    const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-    const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-    const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-
-    const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
-    const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
-
-    const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
-    const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8;
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
-    const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
-    const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8;
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
-    const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
-    const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8;
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
-    const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
-    const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8;
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
-    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
-
-    vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8;
-    vst1q_u16(b, vreinterpretq_u16_f16(vacc89ABCDEF)); b += 8;
-  }
-  if XNN_UNLIKELY(c != 0) {
-    do {
-      const
float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 16; c -= 16) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); - float16x8_t vacc89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(b + 8)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - 
vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
-      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
-
-      vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8;
-      vst1q_u16(b, vreinterpretq_u16_f16(vacc89ABCDEF)); b += 8;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      do {
-        float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b));
-        const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-
-        const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-        vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
-        const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-        vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
-        const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8;
-        vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
-        const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8;
-        vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
-        const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8;
-        vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
-        const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8;
-        vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
-        vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
-
-        vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8;
-
-        c = doz(c, 8);
-      } while (c != 0);
-    }
-  }
-
-  i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
-  i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = (const uint16_t*) zero;
-  }
-  i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = (const uint16_t*) zero;
-  }
-  i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = (const uint16_t*) zero;
-  }
-  i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = (const uint16_t*) zero;
-  }
-  i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = (const uint16_t*) zero;
-  }
-  i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = (const uint16_t*) zero;
-  }
-
-  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.scale));
-  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.min));
-  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.max));
-  for (; channels >= 16; channels -= 16) {
-    float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8;
-    float16x8_t vacc89ABCDEF = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8;
-
-    const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-    const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-
-    const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
-    const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
-    const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
-    const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    vacc89ABCDEF =
vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc89ABCDEF)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - channels -= 8; - } else { - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = 
(xnn_float16*) output + 2;
-          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
-        }
-        if (channels & 1) {
-          vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c
deleted file mode 100644
index 4ad2a9a489e..00000000000
--- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c24.c
+++ /dev/null
@@ -1,339 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24(
-    size_t rows,
-    size_t channels,
-    const xnn_float16* input,
-    size_t input_stride,
-    const xnn_float16* zero,
-    xnn_float16* buffer,
-    xnn_float16* output,
-    const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-  const uint16_t* i0 = (const uint16_t*) input;
-  const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
-  const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
-  const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
-  const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
-  const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
-  const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
-
-  uint16_t* b = (uint16_t*) buffer;
-  size_t c = channels;
-  for (; c >= 24; c -= 24) {
-    const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-    const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-    const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-    const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-    const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-    const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8;
-
-    const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
-    const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
-    const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-    float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
-
-    const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
-    const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8;
-    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
-    const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8;
-    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
-    const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8;
-    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
-    const float16x8_t vi4x89ABCDEF =
vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vacc89ABCDEF)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vaccGHIJKLMN)); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); - float16x8_t vacc89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(b + 8)); - float16x8_t vaccGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(b + 16)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = 
vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF); - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vacc89ABCDEF)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vaccGHIJKLMN)); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const 
float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; channels >= 24; channels -= 24) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - float16x8_t vacc89ABCDEF = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - float16x8_t vaccGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF); - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi4x01234567 = 
vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale); - vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin); - vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax); - vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc89ABCDEF)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vaccGHIJKLMN)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) 
output + 8; - channels -= 8; - } else { - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c deleted file mode 100644 index 689c196b9fe..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c32.c +++ /dev/null @@ -1,388 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN); - const float16x8_t vi2xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 
8; - float16x8_t vaccOPQRSTUV = vaddq_f16(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi3xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi4xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi5xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - const float16x8_t vi6xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vacc89ABCDEF)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vaccGHIJKLMN)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vaccOPQRSTUV)); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = 
vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); - float16x8_t vacc89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(b + 8)); - float16x8_t vaccGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(b + 16)); - float16x8_t vaccOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(b + 24)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF); - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN); - const float16x8_t vi1xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi0xOPQRSTUV); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN); - const float16x8_t vi2xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi1xOPQRSTUV); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi3xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi4xOPQRSTUV = 
vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi5xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - const float16x8_t vi6xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vacc89ABCDEF)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vaccGHIJKLMN)); b += 8; - vst1q_u16(b, vreinterpretq_u16_f16(vaccOPQRSTUV)); b += 8; - } - if XNN_UNLIKELY(c != 0) { - do { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = 
(const uint16_t*) zero; - } - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.max)); - for (; channels >= 32; channels -= 32) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - float16x8_t vacc89ABCDEF = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - float16x8_t vaccGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - float16x8_t vaccOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF); - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN); - const float16x8_t vi1xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi0xOPQRSTUV); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN); - const float16x8_t vi2xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi1xOPQRSTUV); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi3xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi4xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV); - 
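
(For orientation while reviewing: every 7p7x multipass kernel deleted in this patch follows the same three-phase scheme. A first pass sums rows 0-6 into the fp16 scratch buffer, each middle pass folds seven further rows into that buffer while more than seven rows remain, and the final pass adds the last one to seven rows, then applies the scale and the min/max clamp. The scalar sketch below is only a reading aid, not XNNPack code: the function name is made up, the stride is in elements rather than bytes, and the final pass loops over the remaining rows instead of parking spent row pointers on the zero buffer.)

#include <stddef.h>

/* Scalar sketch of the 7p7x multipass global-average-pooling scheme.
 * Illustrative only; scale is typically 1 / total_rows. */
static void gavgpool_7p7x_ref(size_t rows, size_t channels,
                              const float* input, size_t input_stride,
                              float* buffer, float* output,
                              float scale, float min, float max) {
  /* First pass: sum rows 0..6 into the scratch buffer. */
  for (size_t c = 0; c < channels; c++) {
    float acc = 0.0f;
    for (size_t r = 0; r < 7; r++) {
      acc += input[r * input_stride + c];
    }
    buffer[c] = acc;
  }
  input += 7 * input_stride;
  /* Middle passes: fold seven more rows per pass while > 7 remain. */
  for (rows -= 7; rows > 7; rows -= 7) {
    for (size_t c = 0; c < channels; c++) {
      float acc = buffer[c];
      for (size_t r = 0; r < 7; r++) {
        acc += input[r * input_stride + c];
      }
      buffer[c] = acc;
    }
    input += 7 * input_stride;
  }
  /* Final pass: add the last 1..7 rows, then scale and clamp. */
  for (size_t c = 0; c < channels; c++) {
    float acc = buffer[c];
    for (size_t r = 0; r < rows; r++) {
      acc += input[r * input_stride + c];
    }
    acc *= scale;
    acc = acc < min ? min : acc;
    acc = acc > max ? max : acc;
    output[c] = acc;
  }
}
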
const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi5xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - const float16x8_t vi6xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale); - vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale); - vaccOPQRSTUV = vmulq_f16(vaccOPQRSTUV, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin); - vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin); - vaccOPQRSTUV = vmaxq_f16(vaccOPQRSTUV, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax); - vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax); - vaccOPQRSTUV = vminq_f16(vaccOPQRSTUV, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc89ABCDEF)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vaccGHIJKLMN)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vaccOPQRSTUV)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = 
vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - channels -= 8; - } else { - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c deleted file mode 100644 index 4e9ce3cffdd..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7p7x-minmax-neonfp16arith-c8.c +++ /dev/null @@ -1,189 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, 
vreinterpretq_u16_f16(vacc01234567)); b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16(b)); - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc01234567)); b += 8; - } - } - - i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; channels >= 8; channels -= 8) { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const 
float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - { - float16x8_t vacc01234567 = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8; - - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567); - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567); - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - } - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c deleted file mode 100644 index 392a6d93a42..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c16.c +++ /dev/null @@ -1,166 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
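
(A note on the f16c kernels that the following deletions remove: unlike NEON with fp16 arithmetic, F16C provides only half/single conversions, so every addition widens both operands to fp32 with _mm256_cvtph_ps, adds, and immediately narrows the sum back to fp16 with _mm256_cvtps_ph and round-to-nearest. Rounding after each step keeps the accumulator in fp16 precision, comparable to the native-fp16 NEON path. A minimal sketch of one eight-lane accumulation step follows; the helper name and parameters are illustrative, not XNNPack API.)

#include <immintrin.h>
#include <stdint.h>

/* One 8-lane "fp16 += fp16" step in the f16c style: widen to fp32,
 * add, and narrow back with round-to-nearest. Illustrative helper. */
static inline void f16_accumulate8(uint16_t* acc, const uint16_t* row) {
  const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) acc));
  const __m256 vrow = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) row));
  _mm_storeu_si128((__m128i*) acc,
      _mm256_cvtps_ph(_mm256_add_ps(vacc, vrow), _MM_FROUND_TO_NEAREST_INT));
}
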
- -#include <assert.h> - -#include <immintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) &params->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= 16; channels -= 16) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8))); - i0 += 16; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8))); - i1 += 16; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8))); - __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - i2 += 16; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8))); - i3 += 16; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 8))); - i4 += 16; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) 
i5)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 8))); - i5 += 16; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 8))); - i6 += 16; - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - o += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - i2 += 8; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 
8; - } else { - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c deleted file mode 100644 index 0ec70b45cb5..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c24.c +++ /dev/null @@ -1,183 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= 24; channels -= 24) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8))); - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 16))); - i0 += 24; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8))); - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 16))); - i1 += 24; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = 
_mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8))); - __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 16))); - __m128i vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(vi0xGHIJKLMN, vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - i2 += 24; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 16))); - i3 += 24; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 16))); - i4 += 24; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 16))); - i5 += 24; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 16))); - i6 += 24; - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), 
_MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin); - __m256 voutGHIJKLMN = _mm256_max_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax); - voutGHIJKLMN = _mm256_min_ps(voutGHIJKLMN, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o + 16), _mm256_cvtps_ph(voutGHIJKLMN, _MM_FROUND_TO_NEAREST_INT)); - o += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - i2 += 8; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c deleted file mode 100644 index 9290f57cad2..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c32.c +++ /dev/null @@ -1,200 +0,0 @@ -// 
Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= 32; channels -= 32) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - const __m256 vi0x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8))); - const __m256 vi0xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 16))); - const __m256 vi0xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 24))); - i0 += 32; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - const __m256 vi1x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8))); - const __m256 vi1xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 16))); - const __m256 vi1xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 24))); - i1 += 32; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 8))); - __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vi0x89ABCDEF, vi1x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 16))); - __m128i vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(vi0xGHIJKLMN, vi1xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi2xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + 24))); - 
__m128i vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(vi0xOPQRSTUV, vi1xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - i2 += 32; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi2x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 16))); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi2xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi3xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i3 + 24))); - i3 += 32; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi2xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi3x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 16))); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi3xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i4 + 24))); - i4 += 32; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi3xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi4x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 16))); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi4xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i5 + 24))); - i5 += 32; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi4xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 8))); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi5x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xGHIJKLMN = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 16))); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi5xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6xOPQRSTUV = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i6 + 24))); - i6 += 32; - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi5xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - 
vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc89ABCDEF), vi6x89ABCDEF), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vi6xGHIJKLMN), _MM_FROUND_TO_NEAREST_INT); - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vi6xOPQRSTUV), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vscale), _MM_FROUND_TO_NEAREST_INT); - vaccGHIJKLMN = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vscale), _MM_FROUND_TO_NEAREST_INT); - vaccOPQRSTUV = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - __m256 vout89ABCDEF = _mm256_max_ps(_mm256_cvtph_ps(vacc89ABCDEF), vmin); - __m256 voutGHIJKLMN = _mm256_max_ps(_mm256_cvtph_ps(vaccGHIJKLMN), vmin); - __m256 voutOPQRSTUV = _mm256_max_ps(_mm256_cvtph_ps(vaccOPQRSTUV), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - vout89ABCDEF = _mm256_min_ps(vout89ABCDEF, vmax); - voutGHIJKLMN = _mm256_min_ps(voutGHIJKLMN, vmax); - voutOPQRSTUV = _mm256_min_ps(voutOPQRSTUV, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vout89ABCDEF, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o + 16), _mm256_cvtps_ph(voutGHIJKLMN, _MM_FROUND_TO_NEAREST_INT)); - _mm_storeu_si128((__m128i*) (o + 24), _mm256_cvtps_ph(voutOPQRSTUV, _MM_FROUND_TO_NEAREST_INT)); - o += 32; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - i2 += 8; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - if 
XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c deleted file mode 100644 index 90c9e1fdab5..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-f16c-c8.c +++ /dev/null @@ -1,135 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-f16c.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= 8; channels -= 8) { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - i0 += 8; - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - i1 += 8; - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - i2 += 8; - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - i3 += 8; - vacc01234567 = 
_mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - i6 += 8; - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); - const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); - - const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_TO_NEAREST_INT); - - const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_TO_NEAREST_INT); - const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_TO_NEAREST_INT); - vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_TO_NEAREST_INT); - - vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin); - vout01234567 = _mm256_min_ps(vout01234567, vmax); - - __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh01234567); - o += 4; - vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567); - } - if (channels & 2) { - _mm_storeu_si32(o, vh01234567); - o += 2; - vh01234567 = _mm_srli_epi64(vh01234567, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh01234567, 0); - } - } - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c deleted file mode 100644 index 53d6fe0e8c4..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c16.c +++ /dev/null @@ 
-1,143 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; channels >= 16; channels -= 16) { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 
= vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc89ABCDEF)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - channels -= 8; - } else { - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c deleted file mode 100644 index 7ff62c4f759..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c24.c +++ /dev/null @@ -1,160 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.max)); - for (; channels >= 24; channels -= 24) { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; -
vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale); - vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin); - vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax); - vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc89ABCDEF)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vaccGHIJKLMN)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - channels -= 8; - } else { - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - } while (channels != 0); - } -} 
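Note on the deleted kernels above and below: every unipass 7x variant follows one contract — sum up to seven input rows per channel, multiply by the precomputed scale held in params (1/pooling_elements), and clamp the result to [min, max]; row pointers past `rows` are redirected to the shared zero buffer. The following is a minimal scalar sketch of that contract, not an XNNPACK entry point: gavgpool_7x_reference is a hypothetical name, float stands in for the f16 storage type, and strides are in elements rather than bytes.

#include <assert.h>
#include <stddef.h>

/*
 * Illustrative scalar sketch (not part of the patch) of the unipass
 * global-average-pooling contract: sum up to 7 rows per channel,
 * scale by the precomputed reciprocal, clamp to [min, max].
 */
static void gavgpool_7x_reference(
    size_t rows,           /* 1..7 pooling rows */
    size_t channels,
    const float* input,
    size_t input_stride,   /* elements between consecutive rows */
    const float* zero,     /* >= channels zeros; stands in for absent rows */
    float* output,
    float scale,           /* precomputed 1/pooling_elements */
    float min, float max)
{
  assert(rows != 0);
  assert(rows <= 7);
  assert(channels != 0);
  /* Rows beyond `rows` read the shared zero buffer, mirroring the
     XNN_UNPREDICTABLE pointer redirection in the vector kernels. */
  const float* i[7];
  for (size_t m = 0; m < 7; m++) {
    i[m] = (m < rows) ? input + m * input_stride : zero;
  }
  for (size_t c = 0; c < channels; c++) {
    float vacc = i[0][c] + i[1][c];
    for (size_t m = 2; m < 7; m++) {
      vacc += i[m][c];
    }
    vacc *= scale;                    /* average = sum * (1/elements) */
    vacc = vacc < min ? min : vacc;   /* clamp below */
    vacc = vacc > max ? max : vacc;   /* clamp above */
    output[c] = vacc;
  }
}

The deleted vector variants differ from this sketch only in arithmetic detail: the f16c kernels round the accumulator back to fp16 after every addition (the _mm256_cvtps_ph/_mm256_cvtph_ps round trips), so results match native fp16 arithmetic, and the 7p7x multipass variants extend the same pattern to rows > 7 by spilling partial sums to their buffer argument.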
diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c deleted file mode 100644 index 04fcc5edc7b..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c32.c +++ /dev/null @@ -1,177 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) ¶ms->scalar.max)); - for (; channels >= 32; channels -= 32) { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi0xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - const float16x8_t vi1xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - const float16x8_t vi2x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF); - const float16x8_t vi2xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN); - const float16x8_t vi2xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vaccOPQRSTUV = vaddq_f16(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const 
float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi3x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF); - const float16x8_t vi3xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN); - const float16x8_t vi3xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi4x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF); - const float16x8_t vi4xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN); - const float16x8_t vi4xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi5x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF); - const float16x8_t vi5xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN); - const float16x8_t vi5xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - const float16x8_t vi6x89ABCDEF = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF); - const float16x8_t vi6xGHIJKLMN = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN); - const float16x8_t vi6xOPQRSTUV = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF); - vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN); - vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale); - vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale); - vaccOPQRSTUV = vmulq_f16(vaccOPQRSTUV, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin); - vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin); - vaccOPQRSTUV = vmaxq_f16(vaccOPQRSTUV, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax); - vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax); - vaccOPQRSTUV = vminq_f16(vaccOPQRSTUV, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc89ABCDEF)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vaccGHIJKLMN)); output = (xnn_float16*) output + 8; - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vaccOPQRSTUV)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const float16x8_t vi0x01234567 = 
vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - if XNN_LIKELY(channels >= 8) { - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - channels -= 8; - } else { - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c deleted file mode 100644 index 62bef0319b1..00000000000 --- a/src/f16-gavgpool/gen/f16-gavgpool-7x-minmax-neonfp16arith-c8.c +++ /dev/null @@ -1,120 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in -// Generator: tools/xngen -// -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = (const uint16_t*) zero; - } - const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = (const uint16_t*) zero; - } - const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = (const uint16_t*) zero; - } - const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = (const uint16_t*) zero; - } - const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = (const uint16_t*) zero; - } - const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = (const uint16_t*) zero; - } - - const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.scale)); - const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.min)); - const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.max)); - for (; channels >= 8; channels -= 8) { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 = vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - - vacc01234567 = vminq_f16(vacc01234567, vmax); - - vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc01234567)); output = (xnn_float16*) output + 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const float16x8_t vi0x01234567 = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - const float16x8_t vi1x01234567 = vreinterpretq_f16_u16(vld1q_u16(i1)); i1 += 8; - - const float16x8_t vi2x01234567 = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567); - - const float16x8_t vi3x01234567 = vreinterpretq_f16_u16(vld1q_u16(i3)); i3 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567); - const float16x8_t vi4x01234567 = vreinterpretq_f16_u16(vld1q_u16(i4)); i4 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567); - const float16x8_t vi5x01234567 =
vreinterpretq_f16_u16(vld1q_u16(i5)); i5 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567); - const float16x8_t vi6x01234567 = vreinterpretq_f16_u16(vld1q_u16(i6)); i6 += 8; - vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567); - vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567); - - vacc01234567 = vmulq_f16(vacc01234567, vscale); - vacc01234567 = vmaxq_f16(vacc01234567, vmin); - vacc01234567 = vminq_f16(vacc01234567, vmax); - - float16x4_t vacc0123 = vget_low_f16(vacc01234567); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123)); output = (xnn_float16*) output + 4; - vacc0123 = vget_high_f16(vacc01234567); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc0123), 0); output = (xnn_float16*) output + 2; - vacc0123 = vext_f16(vacc0123, vacc0123, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc0123), 0); output = (xnn_float16*) output + 1; - } - } - } -} diff --git a/src/f16-gavgpool/multipass-f16c.c.in b/src/f16-gavgpool/multipass-f16c.c.in deleted file mode 100644 index 04aebc586f1..00000000000 --- a/src/f16-gavgpool/multipass-f16c.c.in +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert ROW_SUBTILE >= 3 -$assert ROW_SUBTILE <= ROW_TILE -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__f16c_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > ${ROW_TILE}); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(2): - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vi${M}x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); i${M} += 8; - - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vi2x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8; - __m128i vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_add_ps(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}), _MM_FROUND_TO_NEAREST_INT); - - $for M in range(2, ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); i${M+1} += 8; - vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vi${M}x${ABC[C:C+8]}), _MM_FROUND_TO_NEAREST_INT); - - $for C in range(0, CHANNEL_TILE, 8): - _mm_store_si128((__m128i*) b, vacc${ABC[C:C+8]}); b += 8; - } - $if CHANNEL_TILE > 8: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(3): - 
const __m256 vi${M}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); i${M} += 8; - __m128i vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_add_ps(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}), _MM_FROUND_TO_NEAREST_INT); - - $for M in range(2, ROW_TILE): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); i${M+1} += 8; - vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vi${M}x${ABC[0:8]}), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:8]}); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { - $for M in range(ROW_SUBTILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) { - __m128i vacc${ABC[0:8]} = _mm_loadu_si128((const __m128i*) b); - $for C in range(8, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+8]} = _mm_loadu_si128((const __m128i*) (b + ${C})); - - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vi0x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); i${M+1} += 8; - vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vi${M}x${ABC[C:C+8]}), _MM_FROUND_TO_NEAREST_INT); - - $for C in range(0, CHANNEL_TILE, 8): - _mm_store_si128((__m128i*) b, vacc${ABC[C:C+8]}); b += 8; - } - $if CHANNEL_TILE > 8: - if XNN_UNLIKELY(c != 0) { - do { - __m128i vacc${ABC[0:8]} = _mm_loadu_si128((const __m128i*) b); - const __m256 vi0x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - $for M in range(ROW_TILE): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); i${M+1} += 8; - vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vi${M}x${ABC[0:8]}), _MM_FROUND_TO_NEAREST_INT); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint16_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); - $for M in range(1, ROW_SUBTILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = (const uint16_t*) zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for C in range(0, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+8]} = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - $for C in range(0, CHANNEL_TILE, 8): - const __m256 vi0x${ABC[C:C+8]} = 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); i${M+1} += 8; - vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vi${M}x${ABC[C:C+8]}), _MM_FROUND_TO_NEAREST_INT); - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vscale), _MM_FROUND_TO_NEAREST_INT); - - $for C in range(0, CHANNEL_TILE, 8): - __m256 vout${ABC[C:C+8]} = _mm256_max_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vmin); - - $for C in range(0, CHANNEL_TILE, 8): - vout${ABC[C:C+8]} = _mm256_min_ps(vout${ABC[C:C+8]}, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT)); - $for C in range(8, CHANNEL_TILE, 8): - _mm_storeu_si128((__m128i*) ((uint16_t*) o + ${C}), _mm256_cvtps_ph(vout${ABC[C:C+8]}, _MM_FROUND_TO_NEAREST_INT)); - o += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - __m128i vacc${ABC[0:8]} = _mm_loadu_si128((const __m128i*) buffer); buffer = (xnn_float16*) buffer + 8; - - const __m256 vi0x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8; - $for M in range(ROW_TILE): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); i${M+1} += 8; - vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vi${M}x${ABC[0:8]}), _MM_FROUND_TO_NEAREST_INT); - - vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout${ABC[0:8]} = _mm256_max_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vmin); - vout${ABC[0:8]} = _mm256_min_ps(vout${ABC[0:8]}, vmax); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh${ABC[0:8]} = _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh${ABC[0:8]}); - o += 4; - vh${ABC[0:8]} = _mm_unpackhi_epi64(vh${ABC[0:8]}, vh${ABC[0:8]}); - } - if (channels & 2) { - _mm_storeu_si32(o, vh${ABC[0:8]}); - o += 2; - vh${ABC[0:8]} = _mm_srli_epi64(vh${ABC[0:8]}, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh${ABC[0:8]}, 0); - } - channels = 0; - } - $else: - __m128i vh${ABC[0:8]} = _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh${ABC[0:8]}); - o += 4; - vh${ABC[0:8]} = _mm_unpackhi_epi64(vh${ABC[0:8]}, vh${ABC[0:8]}); - } - if (channels & 2) { - _mm_storeu_si32(o, vh${ABC[0:8]}); - o += 2; - vh${ABC[0:8]} = _mm_srli_epi64(vh${ABC[0:8]}, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/f16-gavgpool/multipass-neonfp16arith.c.in b/src/f16-gavgpool/multipass-neonfp16arith.c.in deleted file mode 100644 index 1de850cd4ab..00000000000 --- a/src/f16-gavgpool/multipass-neonfp16arith.c.in +++ /dev/null @@ -1,205 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert ROW_SUBTILE >= 3 -$assert ROW_SUBTILE <= ROW_TILE -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__neonfp16arith_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > ${ROW_TILE}); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(2): - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vi${M}x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M})); i${M} += 8; - - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vi2x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8; - float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}); - - $for M in range(2, ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - const float16x8_t vi${M+1}x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8; - vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 8): - vst1q_u16(b, vreinterpretq_u16_f16(vacc${ABC[C:C+8]})); b += 8; - } - $if CHANNEL_TILE > 8: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(3): - const float16x8_t vi${M}x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i${M})); i${M} += 8; - float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}); - - $for M in range(2, ROW_TILE): - $if M + 1 != ROW_TILE: - const float16x8_t vi${M+1}x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8; - vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]}); - - vst1q_u16(b, vreinterpretq_u16_f16(vacc${ABC[0:8]})); b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { - $for M in range(ROW_SUBTILE): - i${M} = (const uint16_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - - uint16_t* b = (uint16_t*) buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) { - float16x8_t vacc${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(b)); - $for C in range(8, CHANNEL_TILE, 8): - float16x8_t vacc${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(b + ${C})); - - $for C in range(0, CHANNEL_TILE, 8): - const float16x8_t vi0x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8; - - $for M in range(ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - const float16x8_t vi${M+1}x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8; - vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 8): - 
      vst1q_u16(b, vreinterpretq_u16_f16(vacc${ABC[C:C+8]})); b += 8;
-    }
-    $if CHANNEL_TILE > 8:
-      if XNN_UNLIKELY(c != 0) {
-        do {
-          float16x8_t vacc${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(b));
-          const float16x8_t vi0x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-
-          $for M in range(ROW_TILE):
-            $if M + 1 != ROW_TILE:
-              const float16x8_t vi${M+1}x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8;
-            vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
-
-          vst1q_u16(b, vreinterpretq_u16_f16(vacc${ABC[0:8]})); b += 8;
-
-          c = doz(c, 8);
-        } while (c != 0);
-      }
-  }
-
-  i0 = (const uint16_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
-  $for M in range(1, ROW_SUBTILE):
-    i${M} = (const uint16_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
-    $if M % 2 == 1:
-      if XNN_UNPREDICTABLE(rows < ${M+1}) {
-        i${M} = (const uint16_t*) zero;
-      }
-    $else:
-      if XNN_UNPREDICTABLE(rows <= ${M}) {
-        i${M} = (const uint16_t*) zero;
-      }
-
-  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.scale));
-  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.min));
-  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.max));
-  for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
-    $for C in range(0, CHANNEL_TILE, 8):
-      float16x8_t vacc${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8;
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      const float16x8_t vi0x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-
-    $for M in range(ROW_TILE):
-      $for C in range(0, CHANNEL_TILE, 8):
-        $if M + 1 != ROW_TILE:
-          const float16x8_t vi${M+1}x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8;
-        vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale);
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin);
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc${ABC[C:C+8]})); output = (xnn_float16*) output + 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    ${"do " if CHANNEL_TILE > 8 else ""}{
-      float16x8_t vacc${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16((const uint16_t*) buffer)); buffer = (xnn_float16*) buffer + 8;
-
-      const float16x8_t vi0x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i0)); i0 += 8;
-      $for M in range(ROW_TILE):
-        $if M + 1 != ROW_TILE:
-          const float16x8_t vi${M+1}x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8;
-        vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
-
-      vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale);
-      vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin);
-      vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax);
-
-      $if CHANNEL_TILE > 8:
-        if XNN_LIKELY(channels >= 8) {
-          vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc${ABC[0:8]})); output = (xnn_float16*) output + 8;
-          channels -= 8;
-        } else {
-          float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
-          if (channels & 4) {
-            vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]})); output = (xnn_float16*) output + 4;
-            vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
-          }
-          if (channels & 2)
{ - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 2; - vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - $else: - float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]}); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]})); output = (xnn_float16*) output + 4; - vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]}); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 2; - vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 1; - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/f16-gavgpool/unipass-f16c.c.in b/src/f16-gavgpool/unipass-f16c.c.in deleted file mode 100644 index bd8b5f33b17..00000000000 --- a/src/f16-gavgpool/unipass-f16c.c.in +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2022 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}x__f16c_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= ${ROW_TILE}); - assert(channels != 0); - - const uint16_t* i0 = (const uint16_t*) input; - $for M in range(1, ROW_TILE): - const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = (const uint16_t*) zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = (const uint16_t*) zero; - } - uint16_t* o = (uint16_t*) output; - - const __m256 vscale = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.scale)); - const __m256 vmin = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.min)); - const __m256 vmax = _mm256_cvtph_ps(_mm_set1_epi16(*(const uint16_t*) ¶ms->scalar.max)); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(2): - const __m256 vi${M}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m256 vi${M}x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - $if C == 0: - const __m256 vi2x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - $else: - const __m256 vi2x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i2 + ${C}))); - __m128i vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_add_ps(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}), _MM_FROUND_TO_NEAREST_INT); - i2 += ${CHANNEL_TILE}; - - $for M in 
range(2, ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - $if C == 0: - const __m256 vi${M+1}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); - $else: - const __m256 vi${M+1}x${ABC[C:C+8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i${M+1} + ${C}))); - $if C + 8 == CHANNEL_TILE: - i${M+1} += ${CHANNEL_TILE}; - vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vi${M}x${ABC[C:C+8]}), _MM_FROUND_TO_NEAREST_INT); - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vscale), _MM_FROUND_TO_NEAREST_INT); - - $for C in range(0, CHANNEL_TILE, 8): - __m256 vout${ABC[C:C+8]} = _mm256_max_ps(_mm256_cvtph_ps(vacc${ABC[C:C+8]}), vmin); - - $for C in range(0, CHANNEL_TILE, 8): - vout${ABC[C:C+8]} = _mm256_min_ps(vout${ABC[C:C+8]}, vmax); - - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT)); - $for C in range(8, CHANNEL_TILE, 8): - _mm_storeu_si128((__m128i*) (o + ${C}), _mm256_cvtps_ph(vout${ABC[C:C+8]}, _MM_FROUND_TO_NEAREST_INT)); - o += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(2): - const __m256 vi${M}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M})); - $if CHANNEL_TILE > 8: - i${M} += 8; - - const __m256 vi2x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); - __m128i vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_add_ps(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}), _MM_FROUND_TO_NEAREST_INT); - $if CHANNEL_TILE > 8: - i2 += 8; - - $for M in range(2, ROW_TILE): - $if M + 1 != ROW_TILE: - const __m256 vi${M+1}x${ABC[0:8]} = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i${M+1})); - $if CHANNEL_TILE > 8: - i${M+1} += 8; - vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vi${M}x${ABC[0:8]}), _MM_FROUND_TO_NEAREST_INT); - - vacc${ABC[0:8]} = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vscale), _MM_FROUND_TO_NEAREST_INT); - __m256 vout${ABC[0:8]} = _mm256_max_ps(_mm256_cvtph_ps(vacc${ABC[0:8]}), vmin); - vout${ABC[0:8]} = _mm256_min_ps(vout${ABC[0:8]}, vmax); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT)); - o += 8; - channels -= 8; - } else { - __m128i vh${ABC[0:8]} = _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh${ABC[0:8]}); - o += 4; - vh${ABC[0:8]} = _mm_unpackhi_epi64(vh${ABC[0:8]}, vh${ABC[0:8]}); - } - if (channels & 2) { - _mm_storeu_si32(o, vh${ABC[0:8]}); - o += 2; - vh${ABC[0:8]} = _mm_srli_epi64(vh${ABC[0:8]}, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh${ABC[0:8]}, 0); - } - channels = 0; - } - $else: - __m128i vh${ABC[0:8]} = _mm256_cvtps_ph(vout${ABC[0:8]}, _MM_FROUND_TO_NEAREST_INT); - if (channels & 4) { - _mm_storel_epi64((__m128i*) o, vh${ABC[0:8]}); - o += 4; - vh${ABC[0:8]} = _mm_unpackhi_epi64(vh${ABC[0:8]}, vh${ABC[0:8]}); - } - if (channels & 2) { - _mm_storeu_si32(o, vh${ABC[0:8]}); - o += 2; - vh${ABC[0:8]} = _mm_srli_epi64(vh${ABC[0:8]}, 32); - } - if (channels & 1) { - *o = (uint16_t) _mm_extract_epi16(vh${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/f16-gavgpool/unipass-neonfp16arith.c.in b/src/f16-gavgpool/unipass-neonfp16arith.c.in deleted file mode 
100644
index 53b05d7826c..00000000000
--- a/src/f16-gavgpool/unipass-neonfp16arith.c.in
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert CHANNEL_TILE % 8 == 0
-$assert CHANNEL_TILE >= 8
-$assert ROW_TILE >= 3
-$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}x__neonfp16arith_c${CHANNEL_TILE}(
-    size_t rows,
-    size_t channels,
-    const xnn_float16* input,
-    size_t input_stride,
-    const xnn_float16* zero,
-    xnn_float16* output,
-    const struct xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= ${ROW_TILE});
-  assert(channels != 0);
-
-  const uint16_t* i0 = (const uint16_t*) input;
-  $for M in range(1, ROW_TILE):
-    const uint16_t* i${M} = (const uint16_t*) ((uintptr_t) i${M-1} + input_stride);
-    $if M % 2 == 1:
-      if XNN_UNPREDICTABLE(rows < ${M+1}) {
-        i${M} = (const uint16_t*) zero;
-      }
-    $else:
-      if XNN_UNPREDICTABLE(rows <= ${M}) {
-        i${M} = (const uint16_t*) zero;
-      }
-
-  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.scale));
-  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.min));
-  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16((const uint16_t*) &params->scalar.max));
-  for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
-    $for M in range(2):
-      $for C in range(0, CHANNEL_TILE, 8):
-        const float16x8_t vi${M}x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M})); i${M} += 8;
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      const float16x8_t vi2x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-      float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
-
-    $for M in range(2, ROW_TILE):
-      $for C in range(0, CHANNEL_TILE, 8):
-        $if M + 1 != ROW_TILE:
-          const float16x8_t vi${M+1}x${ABC[C:C+8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8;
-        vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale);
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin);
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      vst1q_u16((uint16_t*) output, vreinterpretq_u16_f16(vacc${ABC[C:C+8]})); output = (xnn_float16*) output + 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    ${"do " if CHANNEL_TILE > 8 else ""}{
-      $for M in range(2):
-        const float16x8_t vi${M}x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i${M})); i${M} += 8;
-
-      const float16x8_t vi2x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i2)); i2 += 8;
-      float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
-
-      $for M in range(2, ROW_TILE):
-        $if M + 1 != ROW_TILE:
-          const float16x8_t vi${M+1}x${ABC[0:8]} = vreinterpretq_f16_u16(vld1q_u16(i${M+1})); i${M+1} += 8;
-        vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
-
-      vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale);
-      vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin);
-      vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax);
-
-      $if CHANNEL_TILE > 8:
-        if XNN_LIKELY(channels >= 8) {
-          vst1q_u16((uint16_t*) output,
vreinterpretq_u16_f16(vacc${ABC[0:8]})); output = (xnn_float16*) output + 8; - channels -= 8; - } else { - float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]}); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]})); output = (xnn_float16*) output + 4; - vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]}); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 2; - vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 1; - } - channels = 0; - } - $else: - float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]}); - if (channels & 4) { - vst1_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]})); output = (xnn_float16*) output + 4; - vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]}); - } - if (channels & 2) { - vst1_lane_u32((uint16_t*) output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 2; - vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2); - } - if (channels & 1) { - vst1_lane_u16((uint16_t*) output, vreinterpret_u16_f16(vacc${ABC[0:4]}), 0); output = (xnn_float16*) output + 1; - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c b/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c deleted file mode 100644 index ada7adc7204..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-neon-c4.c +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
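For reference (illustrative, not part of the removed sources): every "7p7x" kernel deleted below follows the same multipass scheme. The first pass sums 7 rows into a per-channel scratch buffer, each middle pass folds 7 more rows into that buffer, and the final pass (up to 7 rows, with absent rows redirected to the shared zero row) accumulates, scales by the precomputed 1/rows, and clamps. A plain-C sketch of the control flow, with input_stride in elements rather than bytes and clamping omitted:

  #include <stddef.h>

  static void gavgpool_multipass_sketch(size_t rows, size_t channels,
                                        const float* input, size_t input_stride,
                                        float* buffer, float* output) {
    /* First pass: sum rows 0..6 into the scratch buffer. */
    for (size_t c = 0; c < channels; c++) {
      float sum = 0.0f;
      for (size_t m = 0; m < 7; m++) {
        sum += input[m * input_stride + c];
      }
      buffer[c] = sum;
    }
    /* Middle/final passes: fold up to 7 more rows at a time into the buffer. */
    for (size_t m = 7; m < rows; m += 7) {
      const size_t batch = rows - m < 7 ? rows - m : 7;
      for (size_t c = 0; c < channels; c++) {
        for (size_t k = 0; k < batch; k++) {
          buffer[c] += input[(m + k) * input_stride + c];
        }
      }
    }
    /* Scale by 1/rows (the kernels read this from params->scalar.scale). */
    const float scale = 1.0f / (float) rows;
    for (size_t c = 0; c < channels; c++) {
      output[c] = buffer[c] * scale;
    }
  }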
- -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t packed_channels = round_up_po2(channels, 4); - const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); - - float* b = buffer; - for (size_t c = 0; c < channels; c += 4) { - const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; - const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; - const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; - - const float32x4_t vsum01 = vaddq_f32(vi0, vi1); - const float32x4_t vsum23 = vaddq_f32(vi2, vi3); - const float32x4_t vsum45 = vaddq_f32(vi4, vi5); - - const float32x4_t vsum016 = vaddq_f32(vsum01, vi6); - const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45); - - const float32x4_t vsum = vaddq_f32(vsum016, vsum2345); - - vst1q_f32(b, vsum); b += 4; - } - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = 0; c < channels; c += 4) { - const float32x4_t vi0 = vld1q_f32(i0); i0 += 4; - const float32x4_t vi1 = vld1q_f32(i1); i1 += 4; - const float32x4_t vi2 = vld1q_f32(i2); i2 += 4; - const float32x4_t vi3 = vld1q_f32(i3); i3 += 4; - const float32x4_t vi4 = vld1q_f32(i4); i4 += 4; - const float32x4_t vi5 = vld1q_f32(i5); i5 += 4; - const float32x4_t vi6 = vld1q_f32(i6); i6 += 4; - const float32x4_t vacc = vld1q_f32(b); - - const float32x4_t vsum01 = vaddq_f32(vi0, vi1); - const float32x4_t vsum23 = vaddq_f32(vi2, vi3); - const float32x4_t vsum45 = vaddq_f32(vi4, vi5); - const float32x4_t vsum6a = vaddq_f32(vi6, vacc); - - const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23); - const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a); - - const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a); - - vst1q_f32(b, vsum); b += 4; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const 
float*) ((uintptr_t) i5 + input_increment);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  i6 = (const float*) ((uintptr_t) i6 + input_increment);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
-  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-
-  b = buffer;
-  while (channels >= 4) {
-    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
-    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
-    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
-    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
-    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
-    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
-    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
-    const float32x4_t vacc = vld1q_f32(b); b += 4;
-
-    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
-    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
-    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
-    const float32x4_t vsum6a = vaddq_f32(vi6, vacc);
-
-    const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
-    const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);
-
-    const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);
-
-    float32x4_t vout = vmulq_f32(vsum, vscale);
-    vout = vmaxq_f32(vout, vmin);
-    vout = vminq_f32(vout, vmax);
-
-    vst1q_f32(output, vout); output += 4;
-
-    channels -= 4;
-  }
-  if (channels != 0) {
-    const float32x4_t vi0 = vld1q_f32(i0);
-    const float32x4_t vi1 = vld1q_f32(i1);
-    const float32x4_t vi2 = vld1q_f32(i2);
-    const float32x4_t vi3 = vld1q_f32(i3);
-    const float32x4_t vi4 = vld1q_f32(i4);
-    const float32x4_t vi5 = vld1q_f32(i5);
-    const float32x4_t vi6 = vld1q_f32(i6);
-    const float32x4_t vacc = vld1q_f32(b);
-
-    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
-    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
-    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
-    const float32x4_t vsum6a = vaddq_f32(vi6, vacc);
-
-    const float32x4_t vsum0123 = vaddq_f32(vsum01, vsum23);
-    const float32x4_t vsum456a = vaddq_f32(vsum45, vsum6a);
-
-    const float32x4_t vsum = vaddq_f32(vsum0123, vsum456a);
-
-    float32x4_t vout = vmulq_f32(vsum, vscale);
-    vout = vmaxq_f32(vout, vmin);
-    vout = vminq_f32(vout, vmax);
-
-    float32x2_t vout_lo = vget_low_f32(vout);
-    if (channels & 2) {
-      vst1_f32(output, vout_lo); output += 2;
-      vout_lo = vget_high_f32(vout);
-    }
-    if (channels & 1) {
-      vst1_lane_f32(output, vout_lo, 0);
-    }
-  }
-}
diff --git a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c b/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c
deleted file mode 100644
index 2ef646c652c..00000000000
--- a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-scalar-c1.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
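One detail shared by the scalar kernel below and all of the vector variants: the seven row values are reduced with a balanced tree rather than a linear chain, cutting the dependent-addition depth from 6 to 3. For a single channel:

  static float sum7_tree(const float v[7]) {
    const float vsum01 = v[0] + v[1];
    const float vsum23 = v[2] + v[3];
    const float vsum45 = v[4] + v[5];
    const float vsum016 = vsum01 + v[6];  /* accumulating passes fold the buffer value in here (vsum6a) instead */
    const float vsum2345 = vsum23 + vsum45;
    return vsum016 + vsum2345;            /* 3 dependent levels instead of 6 */
  }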
- -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - channels * sizeof(float); - - float* b = buffer; - size_t c = channels; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - - const float vsum016 = vsum01 + vi6; - const float vsum2345 = vsum23 + vsum45; - - const float vsum = vsum016 + vsum2345; - - *b++ = vsum; - } while (--c != 0); - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - size_t c = channels; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - const float vacc = *b; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - const float vsum6a = vi6 + vacc; - - const float vsum0123 = vsum01 + vsum23; - const float vsum456a = vsum45 + vsum6a; - - const float vsum = vsum0123 + vsum456a; - - *b++ = vsum; - } while (--c != 0); - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const float vscale = params->scalar.scale; - const float vmin = params->scalar.min; - const float vmax = params->scalar.max; - - b = buffer; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - const float vacc = *b++; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - const float vsum6a = 
vi6 + vacc; - - const float vsum0123 = vsum01 + vsum23; - const float vsum456a = vsum45 + vsum6a; - - const float vsum = vsum0123 + vsum456a; - - float vout = vsum * vscale; - vout = math_max_f32(vout, vmin); - vout = math_min_f32(vout, vmax); - - *output++ = vout; - } while (--channels != 0); -} diff --git a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c b/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c deleted file mode 100644 index 00d4c9244a7..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-sse-c4.c +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t packed_channels = round_up_po2(channels, 4); - const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); - - float* b = buffer; - for (size_t c = 0; c < channels; c += 4) { - const __m128 vi0 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1 = _mm_loadu_ps(i1); - i1 += 4; - const __m128 vi2 = _mm_loadu_ps(i2); - i2 += 4; - const __m128 vi3 = _mm_loadu_ps(i3); - i3 += 4; - const __m128 vi4 = _mm_loadu_ps(i4); - i4 += 4; - const __m128 vi5 = _mm_loadu_ps(i5); - i5 += 4; - const __m128 vi6 = _mm_loadu_ps(i6); - i6 += 4; - - const __m128 vsum01 = _mm_add_ps(vi0, vi1); - const __m128 vsum23 = _mm_add_ps(vi2, vi3); - const __m128 vsum45 = _mm_add_ps(vi4, vi5); - - const __m128 vsum016 = _mm_add_ps(vsum01, vi6); - const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45); - - const __m128 vsum = _mm_add_ps(vsum016, vsum2345); - - _mm_store_ps(b, vsum); b += 4; - } - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = 0; c < channels; c += 4) { - const __m128 vi0 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1 = _mm_loadu_ps(i1); - i1 += 4; - const __m128 vi2 = _mm_loadu_ps(i2); - i2 += 4; - const __m128 vi3 = _mm_loadu_ps(i3); - i3 += 4; - const __m128 vi4 = _mm_loadu_ps(i4); - i4 += 4; - const __m128 vi5 = _mm_loadu_ps(i5); - i5 += 4; - const __m128 vi6 = _mm_loadu_ps(i6); - i6 += 4; - const __m128 vacc = _mm_load_ps(b); - - const __m128 vsum01 = _mm_add_ps(vi0, vi1); - const __m128 vsum23 = _mm_add_ps(vi2, vi3); - const __m128 vsum45 = _mm_add_ps(vi4, vi5); - 
const __m128 vsum6a = _mm_add_ps(vi6, vacc); - - const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23); - const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a); - - const __m128 vsum = _mm_add_ps(vsum0123, vsum456a); - - _mm_store_ps(b, vsum); b += 4; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 vmin = _mm_set1_ps(params->scalar.min); - const __m128 vmax = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - - b = buffer; - while (channels >= 4) { - const __m128 vi0 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1 = _mm_loadu_ps(i1); - i1 += 4; - const __m128 vi2 = _mm_loadu_ps(i2); - i2 += 4; - const __m128 vi3 = _mm_loadu_ps(i3); - i3 += 4; - const __m128 vi4 = _mm_loadu_ps(i4); - i4 += 4; - const __m128 vi5 = _mm_loadu_ps(i5); - i5 += 4; - const __m128 vi6 = _mm_loadu_ps(i6); - i6 += 4; - const __m128 vacc = _mm_load_ps(b); - b += 4; - - const __m128 vsum01 = _mm_add_ps(vi0, vi1); - const __m128 vsum23 = _mm_add_ps(vi2, vi3); - const __m128 vsum45 = _mm_add_ps(vi4, vi5); - const __m128 vsum6a = _mm_add_ps(vi6, vacc); - - const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23); - const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a); - - const __m128 vsum = _mm_add_ps(vsum0123, vsum456a); - - __m128 vout = _mm_mul_ps(vsum, vscale); - vout = _mm_max_ps(vout, vmin); - vout = _mm_min_ps(vout, vmax); - - _mm_storeu_ps(output, vout); - output += 4; - - channels -= 4; - } - if (channels != 0) { - const __m128 vi0 = _mm_loadu_ps(i0); - const __m128 vi1 = _mm_loadu_ps(i1); - const __m128 vi2 = _mm_loadu_ps(i2); - const __m128 vi3 = _mm_loadu_ps(i3); - const __m128 vi4 = _mm_loadu_ps(i4); - const __m128 vi5 = _mm_loadu_ps(i5); - const __m128 vi6 = _mm_loadu_ps(i6); - const __m128 vacc = _mm_loadu_ps(b); - - const __m128 vsum01 = _mm_add_ps(vi0, vi1); - const __m128 vsum23 = _mm_add_ps(vi2, vi3); - const __m128 vsum45 = _mm_add_ps(vi4, vi5); - const __m128 vsum6a = _mm_add_ps(vi6, vacc); - - const __m128 vsum0123 = _mm_add_ps(vsum01, vsum23); - const __m128 vsum456a = _mm_add_ps(vsum45, vsum6a); - - const __m128 vsum = _mm_add_ps(vsum0123, vsum456a); - - __m128 vout = _mm_mul_ps(vsum, vscale); - vout = _mm_max_ps(vout, vmin); - vout = _mm_min_ps(vout, vmax); - - if (channels & 2) { - _mm_storel_pi((__m64*) output, vout); - vout = _mm_movehl_ps(vout, vout); - output += 2; - } - if (channels & 1) { - _mm_store_ss(output, vout); - } - } -} diff --git a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c b/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c deleted file mode 100644 index 941b9fea322..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasm-c1.c +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
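A note on the clamping intrinsics (illustrative, not part of the removed sources): the scalar wasm kernel below uses __builtin_wasm_max_f32/__builtin_wasm_min_f32, the fully IEEE-style f32.max/f32.min, and the wasmsimd "-arm" variant likewise uses wasm_f32x4_max/wasm_f32x4_min, which lower to single instructions on ARM. The "-x86" variant instead uses the pseudo-min/max forms, which map directly to x86 minps/maxps: per the WebAssembly SIMD spec, pmax(a, b) is b < a ? a : b and pmin(a, b) is b < a ? b : a, so with the bound as the first operand the clamp becomes:

  #include <wasm_simd128.h>

  /* x86-friendly clamp: pmax/pmin with the clamp bound in the first operand. */
  static v128_t clamp_f32x4_x86_style(v128_t vout, v128_t vmin, v128_t vmax) {
    vout = wasm_f32x4_pmax(vmin, vout);  /* vout < vmin ? vmin : vout */
    vout = wasm_f32x4_pmin(vmax, vout);  /* vout < vmax ? vout : vmax */
    return vout;
  }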
- -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - channels * sizeof(float); - - float* b = buffer; - size_t c = channels; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - - const float vsum016 = vsum01 + vi6; - const float vsum2345 = vsum23 + vsum45; - - const float vsum = vsum016 + vsum2345; - - *b++ = vsum; - } while (--c != 0); - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - size_t c = channels; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - const float vacc = *b; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - const float vsum6a = vi6 + vacc; - - const float vsum0123 = vsum01 + vsum23; - const float vsum456a = vsum45 + vsum6a; - - const float vsum = vsum0123 + vsum456a; - - *b++ = vsum; - } while (--c != 0); - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const float vscale = params->scalar.scale; - const float vmin = params->scalar.min; - const float vmax = params->scalar.max; - - b = buffer; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - const float vacc = *b++; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - const float vsum6a = vi6 
+ vacc; - - const float vsum0123 = vsum01 + vsum23; - const float vsum456a = vsum45 + vsum6a; - - const float vsum = vsum0123 + vsum456a; - - float vout = vsum * vscale; - vout = __builtin_wasm_max_f32(vout, vmin); - vout = __builtin_wasm_min_f32(vout, vmax); - - *output++ = vout; - } while (--channels != 0); -} diff --git a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c b/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c deleted file mode 100644 index 65a49d938a5..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-arm-c4.c +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t packed_channels = round_up_po2(channels, 4); - const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); - - float* b = buffer; - for (size_t c = 0; c < channels; c += 4) { - const v128_t vi0 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3 = wasm_v128_load(i3); - i3 += 4; - const v128_t vi4 = wasm_v128_load(i4); - i4 += 4; - const v128_t vi5 = wasm_v128_load(i5); - i5 += 4; - const v128_t vi6 = wasm_v128_load(i6); - i6 += 4; - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - - const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); - const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); - - const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); - - wasm_v128_store(b, vsum); - b += 4; - } - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = 0; c < channels; c += 4) { - const v128_t vi0 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3 = wasm_v128_load(i3); - i3 += 4; - const v128_t vi4 = wasm_v128_load(i4); - i4 += 4; - const v128_t vi5 = wasm_v128_load(i5); - i5 += 4; - const v128_t vi6 = wasm_v128_load(i6); - i6 += 4; - const v128_t vacc = wasm_v128_load(b); - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const 
v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); - - const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); - const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); - - const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); - - wasm_v128_store(b, vsum); b += 4; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); - const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); - - b = buffer; - while (channels >= 4) { - const v128_t vi0 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3 = wasm_v128_load(i3); - i3 += 4; - const v128_t vi4 = wasm_v128_load(i4); - i4 += 4; - const v128_t vi5 = wasm_v128_load(i5); - i5 += 4; - const v128_t vi6 = wasm_v128_load(i6); - i6 += 4; - const v128_t vacc = wasm_v128_load(b); - b += 4; - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); - - const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); - const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); - - const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); - - v128_t vout = wasm_f32x4_mul(vsum, vscale); - vout = wasm_f32x4_max(vout, vmin); - vout = wasm_f32x4_min(vout, vmax); - - wasm_v128_store(output, vout); - output += 4; - - channels -= 4; - } - if (channels != 0) { - const v128_t vi0 = wasm_v128_load(i0); - const v128_t vi1 = wasm_v128_load(i1); - const v128_t vi2 = wasm_v128_load(i2); - const v128_t vi3 = wasm_v128_load(i3); - const v128_t vi4 = wasm_v128_load(i4); - const v128_t vi5 = wasm_v128_load(i5); - const v128_t vi6 = wasm_v128_load(i6); - const v128_t vacc = wasm_v128_load(b); - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); - - const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); - const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); - - const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); - - v128_t vout = wasm_f32x4_mul(vsum, vscale); - vout = wasm_f32x4_max(vout, vmin); - vout = wasm_f32x4_min(vout, vmax); - - if (channels & 2) { - wasm_v128_store64_lane(output, vout, 0); - vout = wasm_v64x2_shuffle(vout, vout, 1, 1); - output += 2; - } - if (channels & 1) { - wasm_v128_store32_lane(output, vout, 0); - output += 1; - } - } -} diff --git a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c b/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c deleted file mode 100644 index 1a4fd53c876..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7p7x-minmax-wasmsimd-x86-c4.c +++ 
/dev/null @@ -1,209 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t packed_channels = round_up_po2(channels, 4); - const size_t input_increment = 7 * input_stride - packed_channels * sizeof(float); - - float* b = buffer; - for (size_t c = 0; c < channels; c += 4) { - const v128_t vi0 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3 = wasm_v128_load(i3); - i3 += 4; - const v128_t vi4 = wasm_v128_load(i4); - i4 += 4; - const v128_t vi5 = wasm_v128_load(i5); - i5 += 4; - const v128_t vi6 = wasm_v128_load(i6); - i6 += 4; - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - - const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6); - const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45); - - const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345); - - wasm_v128_store(b, vsum); - b += 4; - } - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = 0; c < channels; c += 4) { - const v128_t vi0 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3 = wasm_v128_load(i3); - i3 += 4; - const v128_t vi4 = wasm_v128_load(i4); - i4 += 4; - const v128_t vi5 = wasm_v128_load(i5); - i5 += 4; - const v128_t vi6 = wasm_v128_load(i6); - i6 += 4; - const v128_t vacc = wasm_v128_load(b); - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); - - const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); - const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); - - const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); - - wasm_v128_store(b, vsum); b += 4; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + 
input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const v128_t vscale = wasm_v128_load32_splat(¶ms->scalar.scale); - const v128_t vmin = wasm_v128_load32_splat(¶ms->scalar.min); - const v128_t vmax = wasm_v128_load32_splat(¶ms->scalar.max); - - b = buffer; - while (channels >= 4) { - const v128_t vi0 = wasm_v128_load(i0); - i0 += 4; - const v128_t vi1 = wasm_v128_load(i1); - i1 += 4; - const v128_t vi2 = wasm_v128_load(i2); - i2 += 4; - const v128_t vi3 = wasm_v128_load(i3); - i3 += 4; - const v128_t vi4 = wasm_v128_load(i4); - i4 += 4; - const v128_t vi5 = wasm_v128_load(i5); - i5 += 4; - const v128_t vi6 = wasm_v128_load(i6); - i6 += 4; - const v128_t vacc = wasm_v128_load(b); - b += 4; - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); - - const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); - const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); - - const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); - - v128_t vout = wasm_f32x4_mul(vsum, vscale); - vout = wasm_f32x4_pmax(vmin, vout); - vout = wasm_f32x4_pmin(vmax, vout); - - wasm_v128_store(output, vout); - output += 4; - - channels -= 4; - } - if (channels != 0) { - const v128_t vi0 = wasm_v128_load(i0); - const v128_t vi1 = wasm_v128_load(i1); - const v128_t vi2 = wasm_v128_load(i2); - const v128_t vi3 = wasm_v128_load(i3); - const v128_t vi4 = wasm_v128_load(i4); - const v128_t vi5 = wasm_v128_load(i5); - const v128_t vi6 = wasm_v128_load(i6); - const v128_t vacc = wasm_v128_load(b); - - const v128_t vsum01 = wasm_f32x4_add(vi0, vi1); - const v128_t vsum23 = wasm_f32x4_add(vi2, vi3); - const v128_t vsum45 = wasm_f32x4_add(vi4, vi5); - const v128_t vsum6a = wasm_f32x4_add(vi6, vacc); - - const v128_t vsum0123 = wasm_f32x4_add(vsum01, vsum23); - const v128_t vsum456a = wasm_f32x4_add(vsum45, vsum6a); - - const v128_t vsum = wasm_f32x4_add(vsum0123, vsum456a); - - v128_t vout = wasm_f32x4_mul(vsum, vscale); - vout = wasm_f32x4_pmax(vmin, vout); - vout = wasm_f32x4_pmin(vmax, vout); - - if (channels & 2) { - wasm_v128_store64_lane(output, vout, 0); - vout = wasm_v64x2_shuffle(vout, vout, 1, 1); - output += 2; - } - if (channels & 1) { - wasm_v128_store32_lane(output, vout, 0); - output += 1; - } - } -} diff --git a/src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c b/src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c deleted file mode 100644 index a700a5b108c..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7x-minmax-neon-c4.c +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
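For reference (illustrative, not part of the removed sources): the unipass "7x" kernels below read exactly seven row pointers and redirect the absent ones to the shared zero row, so the sum tree needs no per-row branches. The alternating rows < M+1 and rows <= M tests are the same condition, M >= rows, written two ways. A sketch of the pointer setup, with the stride in bytes as in the kernels:

  #include <stddef.h>
  #include <stdint.h>

  static void setup_row_pointers(const float* input, size_t input_stride,
                                 const float* zero, size_t rows,
                                 const float* i[7]) {
    i[0] = input;
    for (size_t m = 1; m < 7; m++) {
      i[m] = (const float*) ((uintptr_t) i[m - 1] + input_stride);
      if (m >= rows) {
        i[m] = zero;  /* row m does not exist: read the zero row instead */
      }
    }
  }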
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-  const float32x4_t vscale = vld1q_dup_f32(&params->scalar.scale);
-  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-
-  while (channels >= 4) {
-    const float32x4_t vi0 = vld1q_f32(i0); i0 += 4;
-    const float32x4_t vi1 = vld1q_f32(i1); i1 += 4;
-    const float32x4_t vi2 = vld1q_f32(i2); i2 += 4;
-    const float32x4_t vi3 = vld1q_f32(i3); i3 += 4;
-    const float32x4_t vi4 = vld1q_f32(i4); i4 += 4;
-    const float32x4_t vi5 = vld1q_f32(i5); i5 += 4;
-    const float32x4_t vi6 = vld1q_f32(i6); i6 += 4;
-
-    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
-    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
-    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
-
-    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
-    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
-
-    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);
-
-    float32x4_t vout = vmulq_f32(vsum, vscale);
-    vout = vmaxq_f32(vout, vmin);
-    vout = vminq_f32(vout, vmax);
-
-    vst1q_f32(output, vout); output += 4;
-
-    channels -= 4;
-  }
-  if (channels != 0) {
-    const float32x4_t vi0 = vld1q_f32(i0);
-    const float32x4_t vi1 = vld1q_f32(i1);
-    const float32x4_t vi2 = vld1q_f32(i2);
-    const float32x4_t vi3 = vld1q_f32(i3);
-    const float32x4_t vi4 = vld1q_f32(i4);
-    const float32x4_t vi5 = vld1q_f32(i5);
-    const float32x4_t vi6 = vld1q_f32(i6);
-
-    const float32x4_t vsum01 = vaddq_f32(vi0, vi1);
-    const float32x4_t vsum23 = vaddq_f32(vi2, vi3);
-    const float32x4_t vsum45 = vaddq_f32(vi4, vi5);
-
-    const float32x4_t vsum016 = vaddq_f32(vsum01, vi6);
-    const float32x4_t vsum2345 = vaddq_f32(vsum23, vsum45);
-
-    const float32x4_t vsum = vaddq_f32(vsum016, vsum2345);
-
-    float32x4_t vout = vmulq_f32(vsum, vscale);
-    vout = vmaxq_f32(vout, vmin);
-    vout = vminq_f32(vout, vmax);
-
-    float32x2_t vout_lo = vget_low_f32(vout);
-    if (channels & 2) {
-      vst1_f32(output, vout_lo); output += 2;
-      vout_lo = vget_high_f32(vout);
-    }
-    if (channels & 1) {
-      vst1_lane_f32(output, vout_lo, 0);
-    }
-  }
-}
diff --git a/src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c b/src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c
deleted file mode 100644
index 6a495250659..00000000000
--- a/src/f32-gavgpool/f32-gavgpool-7x-minmax-scalar-c1.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
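For reference (illustrative, not part of the removed sources): the scalar.scale/min/max fields read by these kernels come from xnn_f32_scaleminmax_params; for global average pooling the operator setup stores scale = 1/pooled_elements ahead of time, so the kernel itself only multiplies and clamps. A hypothetical per-channel finisher:

  /* Stand-in for the scalar part of xnn_f32_scaleminmax_params. */
  struct scaleminmax_sketch { float scale, min, max; };

  static float finish_channel(float sum, const struct scaleminmax_sketch* p) {
    float vout = sum * p->scale;           /* scale precomputed as 1.0f / rows */
    vout = vout < p->min ? p->min : vout;  /* math_max_f32(vout, vmin) */
    vout = vout > p->max ? p->max : vout;  /* math_min_f32(vout, vmax) */
    return vout;
  }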
- -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - if (rows < 2) { - i1 = zero; - } - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - if (rows <= 2) { - i2 = zero; - } - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - if (rows < 4) { - i3 = zero; - } - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - if (rows <= 4) { - i4 = zero; - } - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - if (rows < 6) { - i5 = zero; - } - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - if (rows <= 6) { - i6 = zero; - } - - const float vscale = params->scalar.scale; - const float vmin = params->scalar.min; - const float vmax = params->scalar.max; - do { - const float vi0 = *i0++; - const float vi1 = *i1++; - const float vi2 = *i2++; - const float vi3 = *i3++; - const float vi4 = *i4++; - const float vi5 = *i5++; - const float vi6 = *i6++; - - const float vsum01 = vi0 + vi1; - const float vsum23 = vi2 + vi3; - const float vsum45 = vi4 + vi5; - - const float vsum016 = vsum01 + vi6; - const float vsum2345 = vsum23 + vsum45; - - const float vsum = vsum016 + vsum2345; - - float vout = vsum * vscale; - vout = math_max_f32(vout, vmin); - vout = math_min_f32(vout, vmax); - - *output++ = vout; - } while (--channels != 0); -} diff --git a/src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c b/src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c deleted file mode 100644 index ea403ef16c5..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7x-minmax-sse-c4.c +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
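For reference (illustrative, not part of the removed sources): the SSE kernels handle the sub-4-channel tail by storing two lanes, rotating the high half down, then storing one lane:

  #include <stddef.h>
  #include <xmmintrin.h>

  static void store_f32_tail_sse(float* output, __m128 vout, size_t channels) {
    /* channels < 4 here */
    if (channels & 2) {
      _mm_storel_pi((__m64*) output, vout);  /* store lanes 0-1 */
      vout = _mm_movehl_ps(vout, vout);      /* lanes 2-3 -> lanes 0-1 */
      output += 2;
    }
    if (channels & 1) {
      _mm_store_ss(output, vout);            /* store lane 0 */
    }
  }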
- -#include - -#include - -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" - - -void xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - if (rows < 2) { - i1 = zero; - } - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - if (rows <= 2) { - i2 = zero; - } - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - if (rows < 4) { - i3 = zero; - } - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - if (rows <= 4) { - i4 = zero; - } - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - if (rows < 6) { - i5 = zero; - } - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - if (rows <= 6) { - i6 = zero; - } - const __m128 vscale = _mm_set1_ps(params->scalar.scale); - const __m128 vmin = _mm_set1_ps(params->scalar.min); - const __m128 vmax = _mm_set1_ps(params->scalar.max); - XNN_FORCE_REALIZATION(vscale); - XNN_FORCE_REALIZATION(vmin); - XNN_FORCE_REALIZATION(vmax); - - while (channels >= 4) { - const __m128 vi0 = _mm_loadu_ps(i0); - i0 += 4; - const __m128 vi1 = _mm_loadu_ps(i1); - i1 += 4; - const __m128 vi2 = _mm_loadu_ps(i2); - i2 += 4; - const __m128 vi3 = _mm_loadu_ps(i3); - i3 += 4; - const __m128 vi4 = _mm_loadu_ps(i4); - i4 += 4; - const __m128 vi5 = _mm_loadu_ps(i5); - i5 += 4; - const __m128 vi6 = _mm_loadu_ps(i6); - i6 += 4; - - const __m128 vsum01 = _mm_add_ps(vi0, vi1); - const __m128 vsum23 = _mm_add_ps(vi2, vi3); - const __m128 vsum45 = _mm_add_ps(vi4, vi5); - - const __m128 vsum016 = _mm_add_ps(vsum01, vi6); - const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45); - - const __m128 vsum = _mm_add_ps(vsum016, vsum2345); - - __m128 vout = _mm_mul_ps(vsum, vscale); - vout = _mm_max_ps(vout, vmin); - vout = _mm_min_ps(vout, vmax); - - _mm_storeu_ps(output, vout); - output += 4; - - channels -= 4; - } - if (channels != 0) { - const __m128 vi0 = _mm_loadu_ps(i0); - const __m128 vi1 = _mm_loadu_ps(i1); - const __m128 vi2 = _mm_loadu_ps(i2); - const __m128 vi3 = _mm_loadu_ps(i3); - const __m128 vi4 = _mm_loadu_ps(i4); - const __m128 vi5 = _mm_loadu_ps(i5); - const __m128 vi6 = _mm_loadu_ps(i6); - - const __m128 vsum01 = _mm_add_ps(vi0, vi1); - const __m128 vsum23 = _mm_add_ps(vi2, vi3); - const __m128 vsum45 = _mm_add_ps(vi4, vi5); - - const __m128 vsum016 = _mm_add_ps(vsum01, vi6); - const __m128 vsum2345 = _mm_add_ps(vsum23, vsum45); - - const __m128 vsum = _mm_add_ps(vsum016, vsum2345); - - __m128 vout = _mm_mul_ps(vsum, vscale); - vout = _mm_max_ps(vout, vmin); - vout = _mm_min_ps(vout, vmax); - - if (channels & 2) { - _mm_storel_pi((__m64*) output, vout); - vout = _mm_movehl_ps(vout, vout); - output += 2; - } - if (channels & 1) { - _mm_store_ss(output, vout); - } - } -} diff --git a/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c b/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c deleted file mode 100644 index a96dbb1362a..00000000000 --- a/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasm-c1.c +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-
-#include <assert.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-
-  const float vscale = params->scalar.scale;
-  const float vmin = params->scalar.min;
-  const float vmax = params->scalar.max;
-  do {
-    const float vi0 = *i0++;
-    const float vi1 = *i1++;
-    const float vi2 = *i2++;
-    const float vi3 = *i3++;
-    const float vi4 = *i4++;
-    const float vi5 = *i5++;
-    const float vi6 = *i6++;
-
-    const float vsum01 = vi0 + vi1;
-    const float vsum23 = vi2 + vi3;
-    const float vsum45 = vi4 + vi5;
-
-    const float vsum016 = vsum01 + vi6;
-    const float vsum2345 = vsum23 + vsum45;
-
-    const float vsum = vsum016 + vsum2345;
-
-    float vout = vsum * vscale;
-    vout = __builtin_wasm_max_f32(vout, vmin);
-    vout = __builtin_wasm_min_f32(vout, vmax);
-
-    *output++ = vout;
-  } while (--channels != 0);
-}
diff --git a/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c b/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c
deleted file mode 100644
index bec887b3c3f..00000000000
--- a/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-arm-c4.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-  const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
-  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
-  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);
-
-  while (channels >= 4) {
-    const v128_t vi0 = wasm_v128_load(i0);
-    i0 += 4;
-    const v128_t vi1 = wasm_v128_load(i1);
-    i1 += 4;
-    const v128_t vi2 = wasm_v128_load(i2);
-    i2 += 4;
-    const v128_t vi3 = wasm_v128_load(i3);
-    i3 += 4;
-    const v128_t vi4 = wasm_v128_load(i4);
-    i4 += 4;
-    const v128_t vi5 = wasm_v128_load(i5);
-    i5 += 4;
-    const v128_t vi6 = wasm_v128_load(i6);
-    i6 += 4;
-
-    const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
-    const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
-    const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
-
-    const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6);
-    const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
-
-    const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);
-
-    v128_t vout = wasm_f32x4_mul(vsum, vscale);
-    vout = wasm_f32x4_max(vout, vmin);
-    vout = wasm_f32x4_min(vout, vmax);
-
-    wasm_v128_store(output, vout);
-    output += 4;
-
-    channels -= 4;
-  }
-  if (channels != 0) {
-    const v128_t vi0 = wasm_v128_load(i0);
-    const v128_t vi1 = wasm_v128_load(i1);
-    const v128_t vi2 = wasm_v128_load(i2);
-    const v128_t vi3 = wasm_v128_load(i3);
-    const v128_t vi4 = wasm_v128_load(i4);
-    const v128_t vi5 = wasm_v128_load(i5);
-    const v128_t vi6 = wasm_v128_load(i6);
-
-    const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
-    const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
-    const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
-
-    const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6);
-    const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
-
-    const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);
-
-    v128_t vout = wasm_f32x4_mul(vsum, vscale);
-    vout = wasm_f32x4_max(vout, vmin);
-    vout = wasm_f32x4_min(vout, vmax);
-
-    if (channels & 2) {
-      wasm_v128_store64_lane(output, vout, 0);
-      vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
-      output += 2;
-    }
-    if (channels & 1) {
-      wasm_v128_store32_lane(output, vout, 0);
-      output += 1;
-    }
-  }
-}
diff --git a/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c b/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c
deleted file mode 100644
index fa80ec9f90f..00000000000
--- a/src/f32-gavgpool/f32-gavgpool-7x-minmax-wasmsimd-x86-c4.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-  const v128_t vscale = wasm_v128_load32_splat(&params->scalar.scale);
-  const v128_t vmin = wasm_v128_load32_splat(&params->scalar.min);
-  const v128_t vmax = wasm_v128_load32_splat(&params->scalar.max);
-
-  while (channels >= 4) {
-    const v128_t vi0 = wasm_v128_load(i0);
-    i0 += 4;
-    const v128_t vi1 = wasm_v128_load(i1);
-    i1 += 4;
-    const v128_t vi2 = wasm_v128_load(i2);
-    i2 += 4;
-    const v128_t vi3 = wasm_v128_load(i3);
-    i3 += 4;
-    const v128_t vi4 = wasm_v128_load(i4);
-    i4 += 4;
-    const v128_t vi5 = wasm_v128_load(i5);
-    i5 += 4;
-    const v128_t vi6 = wasm_v128_load(i6);
-    i6 += 4;
-
-    const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
-    const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
-    const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
-
-    const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6);
-    const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
-
-    const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);
-
-    v128_t vout = wasm_f32x4_mul(vsum, vscale);
-    vout = wasm_f32x4_pmax(vmin, vout);
-    vout = wasm_f32x4_pmin(vmax, vout);
-
-    wasm_v128_store(output, vout);
-    output += 4;
-
-    channels -= 4;
-  }
-  if (channels != 0) {
-    const v128_t vi0 = wasm_v128_load(i0);
-    const v128_t vi1 = wasm_v128_load(i1);
-    const v128_t vi2 = wasm_v128_load(i2);
-    const v128_t vi3 = wasm_v128_load(i3);
-    const v128_t vi4 = wasm_v128_load(i4);
-    const v128_t vi5 = wasm_v128_load(i5);
-    const v128_t vi6 = wasm_v128_load(i6);
-
-    const v128_t vsum01 = wasm_f32x4_add(vi0, vi1);
-    const v128_t vsum23 = wasm_f32x4_add(vi2, vi3);
-    const v128_t vsum45 = wasm_f32x4_add(vi4, vi5);
-
-    const v128_t vsum016 = wasm_f32x4_add(vsum01, vi6);
-    const v128_t vsum2345 = wasm_f32x4_add(vsum23, vsum45);
-
-    const v128_t vsum = wasm_f32x4_add(vsum016, vsum2345);
-
-    v128_t vout = wasm_f32x4_mul(vsum, vscale);
-    vout = wasm_f32x4_pmax(vmin, vout);
-    vout = wasm_f32x4_pmin(vmax, vout);
-
-    if (channels & 2) {
-      wasm_v128_store64_lane(output, vout, 0);
-      vout = wasm_v64x2_shuffle(vout, vout, 1, 1);
-      output += 2;
-    }
-    if (channels & 1) {
-      wasm_v128_store32_lane(output, vout, 0);
-      output += 1;
-    }
-  }
-}
diff --git a/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c b/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c
deleted file mode 100755
index 03c358457c3..00000000000
--- a/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c1v.c
+++ /dev/null
@@ -1,150 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-gavgpool/rvv_7p7x.c.in -// Generator: tools/xngen -// -// Copyright 2024 Imagination Technologies, inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include "xnnpack/gavgpool.h" -#include - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - channels * sizeof(float); - - float* b = buffer; - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m1(c); - - vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n; - vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n; - vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n; - vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n; - vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n; - vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n; - vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n; - - vfloat32m1_t sum01_f32v = __riscv_vfadd_vv_f32m1(i0_f32v, i1_f32v, n); - vfloat32m1_t sum23_f32v = __riscv_vfadd_vv_f32m1(i2_f32v, i3_f32v, n); - vfloat32m1_t sum45_f32v = __riscv_vfadd_vv_f32m1(i4_f32v, i5_f32v, n); - vfloat32m1_t sum016_f32v = __riscv_vfadd_vv_f32m1(sum01_f32v, i6_f32v, n); - vfloat32m1_t sum2345_f32v = __riscv_vfadd_vv_f32m1(sum23_f32v, sum45_f32v, n); - vfloat32m1_t sum_f32v = __riscv_vfadd_vv_f32m1(sum2345_f32v, sum016_f32v, n); - __riscv_vse32_v_f32m1(b, sum_f32v, n); b += n; - - c -= n; - } - - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m1(c); - - vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n; - vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n; - vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n; - vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n; - vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n; - vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n; - vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n; - vfloat32m1_t vacc_f32v = __riscv_vle32_v_f32m1(b, n); - - vfloat32m1_t sum01_f32v = __riscv_vfadd_vv_f32m1(i0_f32v, i1_f32v, n); - vfloat32m1_t sum23_f32v = __riscv_vfadd_vv_f32m1(i2_f32v, i3_f32v, n); - vfloat32m1_t sum45_f32v = __riscv_vfadd_vv_f32m1(i4_f32v, i5_f32v, n); - vfloat32m1_t sum6a_f32v = 
__riscv_vfadd_vv_f32m1(i6_f32v, vacc_f32v, n); - vfloat32m1_t sum0123_f32v = __riscv_vfadd_vv_f32m1(sum01_f32v, sum23_f32v, n); - vfloat32m1_t sum456a_f32v = __riscv_vfadd_vv_f32m1(sum45_f32v, sum6a_f32v, n); - vfloat32m1_t sum_f32v = __riscv_vfadd_vv_f32m1(sum0123_f32v, sum456a_f32v, n); - __riscv_vse32_v_f32m1(b, sum_f32v, n); b += n; - - c -= n; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const float scale = params->scalar.scale; - const float min = params->scalar.min; - const float max = params->scalar.max; - - b = buffer; - for (; channels != 0; ) { - int32_t n = __riscv_vsetvl_e32m1(channels); - - vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n; - vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n; - vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n; - vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n; - vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n; - vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n; - vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n; - vfloat32m1_t vacc_f32v = __riscv_vle32_v_f32m1(b, n); b += n; - - vfloat32m1_t sum01_f32v = __riscv_vfadd_vv_f32m1(i0_f32v, i1_f32v, n); - vfloat32m1_t sum23_f32v = __riscv_vfadd_vv_f32m1(i2_f32v, i3_f32v, n); - vfloat32m1_t sum45_f32v = __riscv_vfadd_vv_f32m1(i4_f32v, i5_f32v, n); - vfloat32m1_t sum6a_f32v = __riscv_vfadd_vv_f32m1(i6_f32v, vacc_f32v, n); - vfloat32m1_t sum0123_f32v = __riscv_vfadd_vv_f32m1(sum01_f32v, sum23_f32v, n); - vfloat32m1_t sum456a_f32v = __riscv_vfadd_vv_f32m1(sum45_f32v, sum6a_f32v, n); - vfloat32m1_t sum_f32v = __riscv_vfadd_vv_f32m1(sum0123_f32v, sum456a_f32v, n); - vfloat32m1_t out_f32v = __riscv_vfmul_vf_f32m1(sum_f32v, scale, n); - out_f32v = __riscv_vfmin_vf_f32m1(__riscv_vfmax_vf_f32m1(out_f32v, min, n), max, n); - __riscv_vse32_v_f32m1(output, out_f32v, n); output += n; - - channels -= n; - } -} diff --git a/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c b/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c deleted file mode 100755 index 85bdfe092c5..00000000000 --- a/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c2v.c +++ /dev/null @@ -1,150 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-gavgpool/rvv_7p7x.c.in -// Generator: tools/xngen -// -// Copyright 2024 Imagination Technologies, inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include "xnnpack/gavgpool.h" -#include - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - channels * sizeof(float); - - float* b = buffer; - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m2(c); - - vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n; - vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n; - vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n; - vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n; - vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n; - vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n; - vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n; - - vfloat32m2_t sum01_f32v = __riscv_vfadd_vv_f32m2(i0_f32v, i1_f32v, n); - vfloat32m2_t sum23_f32v = __riscv_vfadd_vv_f32m2(i2_f32v, i3_f32v, n); - vfloat32m2_t sum45_f32v = __riscv_vfadd_vv_f32m2(i4_f32v, i5_f32v, n); - vfloat32m2_t sum016_f32v = __riscv_vfadd_vv_f32m2(sum01_f32v, i6_f32v, n); - vfloat32m2_t sum2345_f32v = __riscv_vfadd_vv_f32m2(sum23_f32v, sum45_f32v, n); - vfloat32m2_t sum_f32v = __riscv_vfadd_vv_f32m2(sum2345_f32v, sum016_f32v, n); - __riscv_vse32_v_f32m2(b, sum_f32v, n); b += n; - - c -= n; - } - - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m2(c); - - vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n; - vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n; - vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n; - vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n; - vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n; - vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n; - vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n; - vfloat32m2_t vacc_f32v = __riscv_vle32_v_f32m2(b, n); - - vfloat32m2_t sum01_f32v = __riscv_vfadd_vv_f32m2(i0_f32v, i1_f32v, n); - vfloat32m2_t sum23_f32v = __riscv_vfadd_vv_f32m2(i2_f32v, i3_f32v, n); - vfloat32m2_t sum45_f32v = __riscv_vfadd_vv_f32m2(i4_f32v, i5_f32v, n); - vfloat32m2_t sum6a_f32v = __riscv_vfadd_vv_f32m2(i6_f32v, vacc_f32v, n); - vfloat32m2_t sum0123_f32v = __riscv_vfadd_vv_f32m2(sum01_f32v, sum23_f32v, n); - vfloat32m2_t sum456a_f32v = __riscv_vfadd_vv_f32m2(sum45_f32v, sum6a_f32v, n); - vfloat32m2_t sum_f32v = __riscv_vfadd_vv_f32m2(sum0123_f32v, sum456a_f32v, n); - 
__riscv_vse32_v_f32m2(b, sum_f32v, n); b += n; - - c -= n; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const float scale = params->scalar.scale; - const float min = params->scalar.min; - const float max = params->scalar.max; - - b = buffer; - for (; channels != 0; ) { - int32_t n = __riscv_vsetvl_e32m2(channels); - - vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n; - vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n; - vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n; - vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n; - vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n; - vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n; - vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n; - vfloat32m2_t vacc_f32v = __riscv_vle32_v_f32m2(b, n); b += n; - - vfloat32m2_t sum01_f32v = __riscv_vfadd_vv_f32m2(i0_f32v, i1_f32v, n); - vfloat32m2_t sum23_f32v = __riscv_vfadd_vv_f32m2(i2_f32v, i3_f32v, n); - vfloat32m2_t sum45_f32v = __riscv_vfadd_vv_f32m2(i4_f32v, i5_f32v, n); - vfloat32m2_t sum6a_f32v = __riscv_vfadd_vv_f32m2(i6_f32v, vacc_f32v, n); - vfloat32m2_t sum0123_f32v = __riscv_vfadd_vv_f32m2(sum01_f32v, sum23_f32v, n); - vfloat32m2_t sum456a_f32v = __riscv_vfadd_vv_f32m2(sum45_f32v, sum6a_f32v, n); - vfloat32m2_t sum_f32v = __riscv_vfadd_vv_f32m2(sum0123_f32v, sum456a_f32v, n); - vfloat32m2_t out_f32v = __riscv_vfmul_vf_f32m2(sum_f32v, scale, n); - out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, min, n), max, n); - __riscv_vse32_v_f32m2(output, out_f32v, n); output += n; - - channels -= n; - } -} diff --git a/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c b/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c deleted file mode 100755 index 4a5431ee14d..00000000000 --- a/src/f32-gavgpool/gen/f32-gavgpool-7p7x-minmax-rvv-c4v.c +++ /dev/null @@ -1,150 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-gavgpool/rvv_7p7x.c.in -// Generator: tools/xngen -// -// Copyright 2024 Imagination Technologies, inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include "xnnpack/gavgpool.h" -#include - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - channels * sizeof(float); - - float* b = buffer; - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m4(c); - - vfloat32m4_t i0_f32v = __riscv_vle32_v_f32m4(i0, n); i0 += n; - vfloat32m4_t i1_f32v = __riscv_vle32_v_f32m4(i1, n); i1 += n; - vfloat32m4_t i2_f32v = __riscv_vle32_v_f32m4(i2, n); i2 += n; - vfloat32m4_t i3_f32v = __riscv_vle32_v_f32m4(i3, n); i3 += n; - vfloat32m4_t i4_f32v = __riscv_vle32_v_f32m4(i4, n); i4 += n; - vfloat32m4_t i5_f32v = __riscv_vle32_v_f32m4(i5, n); i5 += n; - vfloat32m4_t i6_f32v = __riscv_vle32_v_f32m4(i6, n); i6 += n; - - vfloat32m4_t sum01_f32v = __riscv_vfadd_vv_f32m4(i0_f32v, i1_f32v, n); - vfloat32m4_t sum23_f32v = __riscv_vfadd_vv_f32m4(i2_f32v, i3_f32v, n); - vfloat32m4_t sum45_f32v = __riscv_vfadd_vv_f32m4(i4_f32v, i5_f32v, n); - vfloat32m4_t sum016_f32v = __riscv_vfadd_vv_f32m4(sum01_f32v, i6_f32v, n); - vfloat32m4_t sum2345_f32v = __riscv_vfadd_vv_f32m4(sum23_f32v, sum45_f32v, n); - vfloat32m4_t sum_f32v = __riscv_vfadd_vv_f32m4(sum2345_f32v, sum016_f32v, n); - __riscv_vse32_v_f32m4(b, sum_f32v, n); b += n; - - c -= n; - } - - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m4(c); - - vfloat32m4_t i0_f32v = __riscv_vle32_v_f32m4(i0, n); i0 += n; - vfloat32m4_t i1_f32v = __riscv_vle32_v_f32m4(i1, n); i1 += n; - vfloat32m4_t i2_f32v = __riscv_vle32_v_f32m4(i2, n); i2 += n; - vfloat32m4_t i3_f32v = __riscv_vle32_v_f32m4(i3, n); i3 += n; - vfloat32m4_t i4_f32v = __riscv_vle32_v_f32m4(i4, n); i4 += n; - vfloat32m4_t i5_f32v = __riscv_vle32_v_f32m4(i5, n); i5 += n; - vfloat32m4_t i6_f32v = __riscv_vle32_v_f32m4(i6, n); i6 += n; - vfloat32m4_t vacc_f32v = __riscv_vle32_v_f32m4(b, n); - - vfloat32m4_t sum01_f32v = __riscv_vfadd_vv_f32m4(i0_f32v, i1_f32v, n); - vfloat32m4_t sum23_f32v = __riscv_vfadd_vv_f32m4(i2_f32v, i3_f32v, n); - vfloat32m4_t sum45_f32v = __riscv_vfadd_vv_f32m4(i4_f32v, i5_f32v, n); - vfloat32m4_t sum6a_f32v = __riscv_vfadd_vv_f32m4(i6_f32v, vacc_f32v, n); - vfloat32m4_t sum0123_f32v = __riscv_vfadd_vv_f32m4(sum01_f32v, sum23_f32v, n); - vfloat32m4_t sum456a_f32v = __riscv_vfadd_vv_f32m4(sum45_f32v, sum6a_f32v, n); - vfloat32m4_t sum_f32v = __riscv_vfadd_vv_f32m4(sum0123_f32v, sum456a_f32v, n); - 
__riscv_vse32_v_f32m4(b, sum_f32v, n); b += n; - - c -= n; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const float scale = params->scalar.scale; - const float min = params->scalar.min; - const float max = params->scalar.max; - - b = buffer; - for (; channels != 0; ) { - int32_t n = __riscv_vsetvl_e32m4(channels); - - vfloat32m4_t i0_f32v = __riscv_vle32_v_f32m4(i0, n); i0 += n; - vfloat32m4_t i1_f32v = __riscv_vle32_v_f32m4(i1, n); i1 += n; - vfloat32m4_t i2_f32v = __riscv_vle32_v_f32m4(i2, n); i2 += n; - vfloat32m4_t i3_f32v = __riscv_vle32_v_f32m4(i3, n); i3 += n; - vfloat32m4_t i4_f32v = __riscv_vle32_v_f32m4(i4, n); i4 += n; - vfloat32m4_t i5_f32v = __riscv_vle32_v_f32m4(i5, n); i5 += n; - vfloat32m4_t i6_f32v = __riscv_vle32_v_f32m4(i6, n); i6 += n; - vfloat32m4_t vacc_f32v = __riscv_vle32_v_f32m4(b, n); b += n; - - vfloat32m4_t sum01_f32v = __riscv_vfadd_vv_f32m4(i0_f32v, i1_f32v, n); - vfloat32m4_t sum23_f32v = __riscv_vfadd_vv_f32m4(i2_f32v, i3_f32v, n); - vfloat32m4_t sum45_f32v = __riscv_vfadd_vv_f32m4(i4_f32v, i5_f32v, n); - vfloat32m4_t sum6a_f32v = __riscv_vfadd_vv_f32m4(i6_f32v, vacc_f32v, n); - vfloat32m4_t sum0123_f32v = __riscv_vfadd_vv_f32m4(sum01_f32v, sum23_f32v, n); - vfloat32m4_t sum456a_f32v = __riscv_vfadd_vv_f32m4(sum45_f32v, sum6a_f32v, n); - vfloat32m4_t sum_f32v = __riscv_vfadd_vv_f32m4(sum0123_f32v, sum456a_f32v, n); - vfloat32m4_t out_f32v = __riscv_vfmul_vf_f32m4(sum_f32v, scale, n); - out_f32v = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(out_f32v, min, n), max, n); - __riscv_vse32_v_f32m4(output, out_f32v, n); output += n; - - channels -= n; - } -} diff --git a/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c b/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c deleted file mode 100755 index 6e06d1a65e7..00000000000 --- a/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c1v.c +++ /dev/null @@ -1,79 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/f32-gavgpool/rvv_7x.c.in -// Generator: tools/xngen -// -// Copyright 2024 Imagination Technologies, inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-
-#include <assert.h>
-#include "xnnpack/gavgpool.h"
-#include <riscv_vector.h>
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-
-  const float scale = params->scalar.scale;
-  const float min = params->scalar.min;
-  const float max = params->scalar.max;
-  for (; channels != 0; ) {
-    int32_t n = __riscv_vsetvl_e32m1(channels);
-
-    vfloat32m1_t i0_f32v = __riscv_vle32_v_f32m1(i0, n); i0 += n;
-    vfloat32m1_t i1_f32v = __riscv_vle32_v_f32m1(i1, n); i1 += n;
-    vfloat32m1_t i2_f32v = __riscv_vle32_v_f32m1(i2, n); i2 += n;
-    vfloat32m1_t i3_f32v = __riscv_vle32_v_f32m1(i3, n); i3 += n;
-    vfloat32m1_t i4_f32v = __riscv_vle32_v_f32m1(i4, n); i4 += n;
-    vfloat32m1_t i5_f32v = __riscv_vle32_v_f32m1(i5, n); i5 += n;
-    vfloat32m1_t i6_f32v = __riscv_vle32_v_f32m1(i6, n); i6 += n;
-
-    vfloat32m1_t sum01_f32v = __riscv_vfadd_vv_f32m1(i0_f32v, i1_f32v, n);
-    vfloat32m1_t sum23_f32v = __riscv_vfadd_vv_f32m1(i2_f32v, i3_f32v, n);
-    vfloat32m1_t sum45_f32v = __riscv_vfadd_vv_f32m1(i4_f32v, i5_f32v, n);
-    vfloat32m1_t sum016_f32v = __riscv_vfadd_vv_f32m1(sum01_f32v, i6_f32v, n);
-    vfloat32m1_t sum2345_f32v = __riscv_vfadd_vv_f32m1(sum23_f32v, sum45_f32v, n);
-    vfloat32m1_t sum_f32v = __riscv_vfadd_vv_f32m1(sum2345_f32v, sum016_f32v, n);
-    vfloat32m1_t out_f32v = __riscv_vfmul_vf_f32m1(sum_f32v, scale, n);
-    out_f32v = __riscv_vfmin_vf_f32m1(__riscv_vfmax_vf_f32m1(out_f32v, min, n), max, n);
-    __riscv_vse32_v_f32m1(output, out_f32v, n); output += n;
-
-    channels -= n;
-  }
-}
diff --git a/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c b/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c
deleted file mode 100755
index 89bb91a60d0..00000000000
--- a/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c2v.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-gavgpool/rvv_7x.c.in
-// Generator: tools/xngen
-//
-// Copyright 2024 Imagination Technologies, inc.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include "xnnpack/gavgpool.h"
-#include <riscv_vector.h>
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-
-  const float scale = params->scalar.scale;
-  const float min = params->scalar.min;
-  const float max = params->scalar.max;
-  for (; channels != 0; ) {
-    int32_t n = __riscv_vsetvl_e32m2(channels);
-
-    vfloat32m2_t i0_f32v = __riscv_vle32_v_f32m2(i0, n); i0 += n;
-    vfloat32m2_t i1_f32v = __riscv_vle32_v_f32m2(i1, n); i1 += n;
-    vfloat32m2_t i2_f32v = __riscv_vle32_v_f32m2(i2, n); i2 += n;
-    vfloat32m2_t i3_f32v = __riscv_vle32_v_f32m2(i3, n); i3 += n;
-    vfloat32m2_t i4_f32v = __riscv_vle32_v_f32m2(i4, n); i4 += n;
-    vfloat32m2_t i5_f32v = __riscv_vle32_v_f32m2(i5, n); i5 += n;
-    vfloat32m2_t i6_f32v = __riscv_vle32_v_f32m2(i6, n); i6 += n;
-
-    vfloat32m2_t sum01_f32v = __riscv_vfadd_vv_f32m2(i0_f32v, i1_f32v, n);
-    vfloat32m2_t sum23_f32v = __riscv_vfadd_vv_f32m2(i2_f32v, i3_f32v, n);
-    vfloat32m2_t sum45_f32v = __riscv_vfadd_vv_f32m2(i4_f32v, i5_f32v, n);
-    vfloat32m2_t sum016_f32v = __riscv_vfadd_vv_f32m2(sum01_f32v, i6_f32v, n);
-    vfloat32m2_t sum2345_f32v = __riscv_vfadd_vv_f32m2(sum23_f32v, sum45_f32v, n);
-    vfloat32m2_t sum_f32v = __riscv_vfadd_vv_f32m2(sum2345_f32v, sum016_f32v, n);
-    vfloat32m2_t out_f32v = __riscv_vfmul_vf_f32m2(sum_f32v, scale, n);
-    out_f32v = __riscv_vfmin_vf_f32m2(__riscv_vfmax_vf_f32m2(out_f32v, min, n), max, n);
-    __riscv_vse32_v_f32m2(output, out_f32v, n); output += n;
-
-    channels -= n;
-  }
-}
diff --git a/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c b/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c
deleted file mode 100755
index 9bc4be68427..00000000000
--- a/src/f32-gavgpool/gen/f32-gavgpool-7x-minmax-rvv-c4v.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-gavgpool/rvv_7x.c.in
-// Generator: tools/xngen
-//
-// Copyright 2024 Imagination Technologies, inc.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include "xnnpack/gavgpool.h"
-#include <riscv_vector.h>
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-
-  const float scale = params->scalar.scale;
-  const float min = params->scalar.min;
-  const float max = params->scalar.max;
-  for (; channels != 0; ) {
-    int32_t n = __riscv_vsetvl_e32m4(channels);
-
-    vfloat32m4_t i0_f32v = __riscv_vle32_v_f32m4(i0, n); i0 += n;
-    vfloat32m4_t i1_f32v = __riscv_vle32_v_f32m4(i1, n); i1 += n;
-    vfloat32m4_t i2_f32v = __riscv_vle32_v_f32m4(i2, n); i2 += n;
-    vfloat32m4_t i3_f32v = __riscv_vle32_v_f32m4(i3, n); i3 += n;
-    vfloat32m4_t i4_f32v = __riscv_vle32_v_f32m4(i4, n); i4 += n;
-    vfloat32m4_t i5_f32v = __riscv_vle32_v_f32m4(i5, n); i5 += n;
-    vfloat32m4_t i6_f32v = __riscv_vle32_v_f32m4(i6, n); i6 += n;
-
-    vfloat32m4_t sum01_f32v = __riscv_vfadd_vv_f32m4(i0_f32v, i1_f32v, n);
-    vfloat32m4_t sum23_f32v = __riscv_vfadd_vv_f32m4(i2_f32v, i3_f32v, n);
-    vfloat32m4_t sum45_f32v = __riscv_vfadd_vv_f32m4(i4_f32v, i5_f32v, n);
-    vfloat32m4_t sum016_f32v = __riscv_vfadd_vv_f32m4(sum01_f32v, i6_f32v, n);
-    vfloat32m4_t sum2345_f32v = __riscv_vfadd_vv_f32m4(sum23_f32v, sum45_f32v, n);
-    vfloat32m4_t sum_f32v = __riscv_vfadd_vv_f32m4(sum2345_f32v, sum016_f32v, n);
-    vfloat32m4_t out_f32v = __riscv_vfmul_vf_f32m4(sum_f32v, scale, n);
-    out_f32v = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(out_f32v, min, n), max, n);
-    __riscv_vse32_v_f32m4(output, out_f32v, n); output += n;
-
-    channels -= n;
-  }
-}
diff --git a/src/f32-gavgpool/rvv_7p7x.c.in b/src/f32-gavgpool/rvv_7p7x.c.in
deleted file mode 100755
index 9f4ca6e86b3..00000000000
--- a/src/f32-gavgpool/rvv_7p7x.c.in
+++ /dev/null
@@ -1,147 +0,0 @@
-// Copyright 2024 Imagination Technologies, inc.
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
- -$assert LMUL in [1, 2, 4, 8] -#include -#include "xnnpack/gavgpool.h" -#include - -void xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c${LMUL}v( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const float* i0 = input; - const float* i1 = (const float*) ((uintptr_t) i0 + input_stride); - const float* i2 = (const float*) ((uintptr_t) i1 + input_stride); - const float* i3 = (const float*) ((uintptr_t) i2 + input_stride); - const float* i4 = (const float*) ((uintptr_t) i3 + input_stride); - const float* i5 = (const float*) ((uintptr_t) i4 + input_stride); - const float* i6 = (const float*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - channels * sizeof(float); - - float* b = buffer; - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m${LMUL}(c); - - vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n; - vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n; - vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n; - vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n; - vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n; - vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n; - vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n; - - vfloat32m${LMUL}_t sum01_f32v = __riscv_vfadd_vv_f32m${LMUL}(i0_f32v, i1_f32v, n); - vfloat32m${LMUL}_t sum23_f32v = __riscv_vfadd_vv_f32m${LMUL}(i2_f32v, i3_f32v, n); - vfloat32m${LMUL}_t sum45_f32v = __riscv_vfadd_vv_f32m${LMUL}(i4_f32v, i5_f32v, n); - vfloat32m${LMUL}_t sum016_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum01_f32v, i6_f32v, n); - vfloat32m${LMUL}_t sum2345_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum23_f32v, sum45_f32v, n); - vfloat32m${LMUL}_t sum_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum2345_f32v, sum016_f32v, n); - __riscv_vse32_v_f32m${LMUL}(b, sum_f32v, n); b += n; - - c -= n; - } - - for (rows -= 7; rows > 7; rows -= 7) { - b = buffer; - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - i2 = (const float*) ((uintptr_t) i2 + input_increment); - i3 = (const float*) ((uintptr_t) i3 + input_increment); - i4 = (const float*) ((uintptr_t) i4 + input_increment); - i5 = (const float*) ((uintptr_t) i5 + input_increment); - i6 = (const float*) ((uintptr_t) i6 + input_increment); - - for (size_t c = channels; c != 0; ) { - int32_t n = __riscv_vsetvl_e32m${LMUL}(c); - - vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n; - vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n; - vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n; - vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n; - vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n; - vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n; - vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n; - vfloat32m${LMUL}_t vacc_f32v = __riscv_vle32_v_f32m${LMUL}(b, n); - - vfloat32m${LMUL}_t sum01_f32v = __riscv_vfadd_vv_f32m${LMUL}(i0_f32v, i1_f32v, n); - vfloat32m${LMUL}_t sum23_f32v = __riscv_vfadd_vv_f32m${LMUL}(i2_f32v, i3_f32v, n); - vfloat32m${LMUL}_t sum45_f32v = 
__riscv_vfadd_vv_f32m${LMUL}(i4_f32v, i5_f32v, n); - vfloat32m${LMUL}_t sum6a_f32v = __riscv_vfadd_vv_f32m${LMUL}(i6_f32v, vacc_f32v, n); - vfloat32m${LMUL}_t sum0123_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum01_f32v, sum23_f32v, n); - vfloat32m${LMUL}_t sum456a_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum45_f32v, sum6a_f32v, n); - vfloat32m${LMUL}_t sum_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum0123_f32v, sum456a_f32v, n); - __riscv_vse32_v_f32m${LMUL}(b, sum_f32v, n); b += n; - - c -= n; - } - } - - i0 = (const float*) ((uintptr_t) i0 + input_increment); - i1 = (const float*) ((uintptr_t) i1 + input_increment); - if (rows < 2) { - i1 = zero; - } - i2 = (const float*) ((uintptr_t) i2 + input_increment); - if (rows <= 2) { - i2 = zero; - } - i3 = (const float*) ((uintptr_t) i3 + input_increment); - if (rows < 4) { - i3 = zero; - } - i4 = (const float*) ((uintptr_t) i4 + input_increment); - if (rows <= 4) { - i4 = zero; - } - i5 = (const float*) ((uintptr_t) i5 + input_increment); - if (rows < 6) { - i5 = zero; - } - i6 = (const float*) ((uintptr_t) i6 + input_increment); - if (rows <= 6) { - i6 = zero; - } - const float scale = params->scalar.scale; - const float min = params->scalar.min; - const float max = params->scalar.max; - - b = buffer; - for (; channels != 0; ) { - int32_t n = __riscv_vsetvl_e32m${LMUL}(channels); - - vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n; - vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n; - vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n; - vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n; - vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n; - vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n; - vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n; - vfloat32m${LMUL}_t vacc_f32v = __riscv_vle32_v_f32m${LMUL}(b, n); b += n; - - vfloat32m${LMUL}_t sum01_f32v = __riscv_vfadd_vv_f32m${LMUL}(i0_f32v, i1_f32v, n); - vfloat32m${LMUL}_t sum23_f32v = __riscv_vfadd_vv_f32m${LMUL}(i2_f32v, i3_f32v, n); - vfloat32m${LMUL}_t sum45_f32v = __riscv_vfadd_vv_f32m${LMUL}(i4_f32v, i5_f32v, n); - vfloat32m${LMUL}_t sum6a_f32v = __riscv_vfadd_vv_f32m${LMUL}(i6_f32v, vacc_f32v, n); - vfloat32m${LMUL}_t sum0123_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum01_f32v, sum23_f32v, n); - vfloat32m${LMUL}_t sum456a_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum45_f32v, sum6a_f32v, n); - vfloat32m${LMUL}_t sum_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum0123_f32v, sum456a_f32v, n); - vfloat32m${LMUL}_t out_f32v = __riscv_vfmul_vf_f32m${LMUL}(sum_f32v, scale, n); - out_f32v = __riscv_vfmin_vf_f32m${LMUL}(__riscv_vfmax_vf_f32m${LMUL}(out_f32v, min, n), max, n); - __riscv_vse32_v_f32m${LMUL}(output, out_f32v, n); output += n; - - channels -= n; - } -} diff --git a/src/f32-gavgpool/rvv_7x.c.in b/src/f32-gavgpool/rvv_7x.c.in deleted file mode 100755 index 9c8495d544c..00000000000 --- a/src/f32-gavgpool/rvv_7x.c.in +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2024 Imagination Technologies, inc. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-
-$assert LMUL in [1, 2, 4, 8]
-#include <assert.h>
-#include "xnnpack/gavgpool.h"
-#include <riscv_vector.h>
-
-void xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c${LMUL}v(
-    size_t rows,
-    size_t channels,
-    const float* input,
-    size_t input_stride,
-    const float* zero,
-    float* output,
-    const struct xnn_f32_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const float* i0 = input;
-  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = zero;
-  }
-  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = zero;
-  }
-  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = zero;
-  }
-  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = zero;
-  }
-  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = zero;
-  }
-  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = zero;
-  }
-
-  const float scale = params->scalar.scale;
-  const float min = params->scalar.min;
-  const float max = params->scalar.max;
-  for (; channels != 0; ) {
-    int32_t n = __riscv_vsetvl_e32m${LMUL}(channels);
-
-    vfloat32m${LMUL}_t i0_f32v = __riscv_vle32_v_f32m${LMUL}(i0, n); i0 += n;
-    vfloat32m${LMUL}_t i1_f32v = __riscv_vle32_v_f32m${LMUL}(i1, n); i1 += n;
-    vfloat32m${LMUL}_t i2_f32v = __riscv_vle32_v_f32m${LMUL}(i2, n); i2 += n;
-    vfloat32m${LMUL}_t i3_f32v = __riscv_vle32_v_f32m${LMUL}(i3, n); i3 += n;
-    vfloat32m${LMUL}_t i4_f32v = __riscv_vle32_v_f32m${LMUL}(i4, n); i4 += n;
-    vfloat32m${LMUL}_t i5_f32v = __riscv_vle32_v_f32m${LMUL}(i5, n); i5 += n;
-    vfloat32m${LMUL}_t i6_f32v = __riscv_vle32_v_f32m${LMUL}(i6, n); i6 += n;
-
-    vfloat32m${LMUL}_t sum01_f32v = __riscv_vfadd_vv_f32m${LMUL}(i0_f32v, i1_f32v, n);
-    vfloat32m${LMUL}_t sum23_f32v = __riscv_vfadd_vv_f32m${LMUL}(i2_f32v, i3_f32v, n);
-    vfloat32m${LMUL}_t sum45_f32v = __riscv_vfadd_vv_f32m${LMUL}(i4_f32v, i5_f32v, n);
-    vfloat32m${LMUL}_t sum016_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum01_f32v, i6_f32v, n);
-    vfloat32m${LMUL}_t sum2345_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum23_f32v, sum45_f32v, n);
-    vfloat32m${LMUL}_t sum_f32v = __riscv_vfadd_vv_f32m${LMUL}(sum2345_f32v, sum016_f32v, n);
-    vfloat32m${LMUL}_t out_f32v = __riscv_vfmul_vf_f32m${LMUL}(sum_f32v, scale, n);
-    out_f32v = __riscv_vfmin_vf_f32m${LMUL}(__riscv_vfmax_vf_f32m${LMUL}(out_f32v, min, n), max, n);
-    __riscv_vse32_v_f32m${LMUL}(output, out_f32v, n); output += n;
-
-    channels -= n;
-  }
-}
diff --git a/src/microparams-init.c b/src/microparams-init.c
index df4014b7d7e..97c49cce527 100644
--- a/src/microparams-init.c
+++ b/src/microparams-init.c
@@ -1380,144 +1380,6 @@ size_t xnn_init_f32_scaleminmax_scalar_params(
   return sizeof(params->scalar);
 }
 
-size_t xnn_init_f32_gavgpool_scalar_params(
-    union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
-    float multiplier,
-    float output_min,
-    float output_max,
-    uint32_t width)
-{
-  params->scalar.multiplier = multiplier;
-  params->scalar.output_min = output_min;
-  params->scalar.output_max = output_max;
-
-  const uint32_t w = (width - 1) & 3;
-  params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
-  params->scalar.mask[1] = -(int32_t) (w >= 1);
-  params->scalar.mask[2] = -(int32_t) (w >= 2);
-  params->scalar.mask[3] = -(int32_t) (w >= 3);
-  return sizeof(params->scalar);
-}
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-size_t xnn_init_f32_gavgpool_neon_params(
-    union
xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - float multiplier, - float output_min, - float output_max, - uint32_t width) -{ - params->neon.multiplier = multiplier; - params->neon.output_min = output_min; - params->neon.output_max = output_max; - - const uint32_t w = (width - 1) & 3; - params->neon.mask[0] = UINT32_C(0xFFFFFFFF); - params->neon.mask[1] = -(uint32_t) (w >= 1); - params->neon.mask[2] = -(uint32_t) (w >= 2); - params->neon.mask[3] = -(uint32_t) (w >= 3); - return sizeof(params->neon); -} -#endif - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 -size_t xnn_init_f32_gavgpool_sse_params( - union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - float multiplier, - float output_min, - float output_max, - uint32_t width) -{ - for (uint32_t i = 0; i < 4; i++) { - params->sse.multiplier[i] = multiplier; - params->sse.output_min[i] = output_min; - params->sse.output_max[i] = output_max; - } - - const uint32_t w = (width - 1) & 3; - params->sse.mask[0] = UINT32_C(0xFFFFFFFF); - params->sse.mask[1] = -(uint32_t) (w >= 1); - params->sse.mask[2] = -(uint32_t) (w >= 2); - params->sse.mask[3] = -(uint32_t) (w >= 3); - return sizeof(params->sse); -} -#endif - -size_t xnn_init_f16_gavgpool_scalar_params( - union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - uint16_t multiplier, - uint16_t output_min, - uint16_t output_max, - uint32_t width) -{ - params->scalar.multiplier = multiplier; - params->scalar.output_min = output_min; - params->scalar.output_max = output_max; - - const uint32_t w = (width - 1) & 7; - params->scalar.mask[0] = UINT16_C(0xFFFF); - params->scalar.mask[1] = -(uint16_t) (w >= 1); - params->scalar.mask[2] = -(uint16_t) (w >= 2); - params->scalar.mask[3] = -(uint16_t) (w >= 3); - params->scalar.mask[4] = -(uint16_t) (w >= 4); - params->scalar.mask[5] = -(uint16_t) (w >= 5); - params->scalar.mask[6] = -(uint16_t) (w >= 6); - params->scalar.mask[7] = -(uint16_t) (w >= 7); - return sizeof(params->scalar); -} - -void xnn_update_f32_gavgpool_params( - union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - float multiplier, - uint32_t width) -{ - #if XNN_ARCH_X86 || XNN_ARCH_X86_64 - for (uint32_t i = 0; i < 4; i++) { - params->sse.multiplier[i] = multiplier; - } - - const uint32_t w = (width - 1) & 3; - params->sse.mask[0] = UINT32_C(0xFFFFFFFF); - params->sse.mask[1] = -(uint32_t) (w >= 1); - params->sse.mask[2] = -(uint32_t) (w >= 2); - params->sse.mask[3] = -(uint32_t) (w >= 3); - #elif XNN_ARCH_ARM || XNN_ARCH_ARM64 - params->neon.multiplier = multiplier; - - const uint32_t w = (width - 1) & 3; - params->neon.mask[0] = UINT32_C(0xFFFFFFFF); - params->neon.mask[1] = -(uint32_t) (w >= 1); - params->neon.mask[2] = -(uint32_t) (w >= 2); - params->neon.mask[3] = -(uint32_t) (w >= 3); - #else - params->scalar.multiplier = multiplier; - - const uint32_t w = (width - 1) & 3; - params->scalar.mask[0] = UINT32_C(0xFFFFFFFF); - params->scalar.mask[1] = -(int32_t) (w >= 1); - params->scalar.mask[2] = -(int32_t) (w >= 2); - params->scalar.mask[3] = -(int32_t) (w >= 3); - #endif -} - -void xnn_update_f16_gavgpool_scalar_params( - union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - uint16_t multiplier, - uint32_t width) -{ - params->scalar.multiplier = multiplier; - - const uint32_t w = (width - 1) & 7; - params->scalar.mask[0] = UINT16_C(0xFFFF); - params->scalar.mask[1] = -(uint16_t) (w >= 1); - params->scalar.mask[2] = -(uint16_t) (w >= 2); - params->scalar.mask[3] = -(uint16_t) (w >= 3); - params->scalar.mask[4] = -(uint16_t) (w >= 4); - 
params->scalar.mask[5] = -(uint16_t) (w >= 5); - params->scalar.mask[6] = -(uint16_t) (w >= 6); - params->scalar.mask[7] = -(uint16_t) (w >= 7); -} - size_t xnn_init_bf16_minmax_scalar_params( struct xnn_bf16_minmax_params params[XNN_MIN_ELEMENTS(1)], xnn_bfloat16 output_min, diff --git a/src/operator-run.c b/src/operator-run.c index bab0e960a7a..7bd27ebddec 100644 --- a/src/operator-run.c +++ b/src/operator-run.c @@ -1417,93 +1417,6 @@ void xnn_compute_pixelwise_average_pooling_multipass_with_thread( &context->params); } -void xnn_compute_global_average_pooling_nwc_unipass( - const struct global_average_pooling_nwc_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_index) -{ - const void* input = - (const void*) ((uintptr_t) context->input + batch_index * context->input_batch_stride); - void* output = - (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride); - - context->unipass_ukernel( - context->input_elements, - context->channels, - input, - context->input_pixel_stride, - context->zero, - output, - &context->params); -} - -void xnn_compute_global_average_pooling_nwc_multipass( - const struct global_average_pooling_nwc_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_index) -{ - const void* input = - (const void*) ((uintptr_t) context->input + batch_index * context->input_batch_stride); - void* output = - (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride); - void* multipass_buffer = - (void*) ((uintptr_t) context->multipass_buffer + batch_index * context->multipass_batch_stride); - - assert(context->multipass_buffer != NULL); - - context->multipass_ukernel( - context->input_elements, - context->channels, - input, - context->input_pixel_stride, - context->zero, - multipass_buffer, - output, - &context->params); -} - -void xnn_compute_global_average_pooling_nwc_multipass_with_thread( - const struct global_average_pooling_nwc_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t thread_index, - size_t batch_index) -{ - const void* input = - (const void*) ((uintptr_t) context->input + batch_index * context->input_batch_stride); - void* output = - (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride); - void* multipass_buffer = - (void*) ((uintptr_t) context->multipass_buffer + thread_index * context->multipass_batch_stride); - - assert(context->multipass_buffer != NULL); - - context->multipass_ukernel( - context->input_elements, - context->channels, - input, - context->input_pixel_stride, - context->zero, - multipass_buffer, - output, - &context->params); -} - -void xnn_compute_global_average_pooling_ncw( - const struct global_average_pooling_ncw_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_index, - size_t channels_start, - size_t channels_slice) -{ - const void* input = (const void*) ((uintptr_t) context->input + - channels_start * context->input_channel_stride + batch_index * context->input_batch_stride); - void* output = (void*) ((uintptr_t) context->output + - channels_start * context->output_channel_stride + batch_index * context->output_batch_stride); - - context->ukernel( - context->input_elements, - channels_slice, - input, - output, - &context->params); -} - void xnn_compute_resize_bilinear_indirection( const struct resize_bilinear_nhwc_indirection_init_context context[restrict XNN_MIN_ELEMENTS(1)], size_t output_y_start, diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c 
b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c deleted file mode 100644 index 4e8be75b627..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c16.c +++ /dev/null @@ -1,315 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - - for 
(rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const int8x16_t voutput_min 
= vld1q_dup_s8(¶ms->fp32_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neon.output_max); - for (; channels >= 16; channels -= 16) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), 
vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c deleted file mode 100644 index 767bdf22c34..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c24.c +++ /dev/null @@ -1,437 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
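Every fp32 kernel deleted in this patch requantizes the way the c16 kernel above just did: convert the int32 accumulator to float, multiply by the scale, add a magic bias so the float adder performs the round-to-nearest, reinterpret the bits as int32, then saturating-subtract magic_bias_less_output_zero_point and narrow. A scalar sketch of that math follows; the helper name is hypothetical, the constants mirror how the fp32_neon params fields are typically initialized, and the trick is only exact while the scaled accumulator stays well inside +/-2^22.

/* Scalar sketch of the fp32 magic-bias requantization used above
 * (hypothetical helper, not XNNPACK API). Assumes IEEE-754 binary32. */
#include <stdint.h>
#include <string.h>

static int8_t requantize_magic_bias_sketch(int32_t acc, float scale,
                                           int32_t output_zero_point) {
  const float vmagic_bias = 12582912.0f;  /* 1.5 * 2^23, bit pattern 0x4B400000 */
  const int32_t vmagic_bias_less_output_zero_point =
      INT32_C(0x4B400000) - output_zero_point;

  /* Adding the bias pins the exponent, so rounding to nearest happens in
   * the float add and the integer result lands in the low mantissa bits. */
  const float vfpacc = (float) acc * scale + vmagic_bias;
  int32_t vout;
  memcpy(&vout, &vfpacc, sizeof(vout));  /* the vreinterpretq_s32_f32 step */

  /* Subtracting the bias bits folds in the output zero point; the kernels
   * use vqsubq_s32 here and clamp to output_min/output_max afterwards. */
  vout -= vmagic_bias_less_output_zero_point;
  if (vout < INT8_MIN) vout = INT8_MIN;
  if (vout > INT8_MAX) vout = INT8_MAX;
  return (int8_t) vout;
}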
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF =
vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const 
int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const int8x16_t voutput_min = 
vld1q_dup_s8(¶ms->fp32_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neon.output_max); - for (; channels >= 24; channels -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = 
vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min)); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max)); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1_s8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = 
vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c deleted file mode 100644 index 5b85cd2b8b1..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c32.c +++ /dev/null @@ -1,500 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const
int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - const int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV)); - const int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - 
vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = 
vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(b + 24); - int32x4_t vaccSTUV = vld1q_s32(b + 28); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV)); - vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t 
vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neon.output_max); - for (; channels >= 32; channels -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4; - 
vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV)); - vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - vaccOPQR = vreinterpretq_s32_f32(vaddq_f32(vfpaccOPQR, vmagic_bias)); - vaccSTUV = vreinterpretq_s32_f32(vaddq_f32(vfpaccSTUV, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = vqsubq_s32(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - 
int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c deleted file mode 100644 index a36fff777f4..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neon-c8.c +++ /dev/null @@ -1,246 +0,0 @@ -// Auto-generated file. Do not edit! 
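The c8 file whose header begins here is the narrowest variant of the same scheme; c8, c16, c24, and c32 differ only in how many channels they handle per iteration. The shared shape is: a first pass seeds the int32 buffer with init_bias plus seven input rows, zero or more middle passes read the buffer back and add seven more rows (rewinding the row pointers by input_increment), and a final pass over the at-most-seven remaining rows, with exhausted pointers redirected to zero, requantizes into int8. A minimal scalar sketch of that control flow, with illustrative names and the requantization step omitted:

/* Scalar sketch of the 7p7x multipass accumulation (illustrative only;
 * the real kernels vectorize, pad channels, and walk raw row pointers).
 * Assumes rows > 7, matching the kernels' assert; strides are in
 * elements, which for int8_t inputs equals bytes. */
#include <stddef.h>
#include <stdint.h>

static void gavgpool_7p7x_sketch(size_t rows, size_t channels,
                                 const int8_t* input, size_t input_stride,
                                 int32_t init_bias, int32_t* buffer,
                                 int32_t* pre_quant_out) {
  /* First pass: seed the buffer with init_bias plus the first 7 rows. */
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = init_bias;
    for (size_t r = 0; r < 7; r++) {
      acc += input[r * input_stride + c];
    }
    buffer[c] = acc;
  }
  input += 7 * input_stride;

  /* Middle passes: fold 7 more rows at a time into the buffer. */
  for (rows -= 7; rows > 7; rows -= 7) {
    for (size_t c = 0; c < channels; c++) {
      for (size_t r = 0; r < 7; r++) {
        buffer[c] += input[r * input_stride + c];
      }
    }
    input += 7 * input_stride;
  }

  /* Final pass: 1..7 rows remain; the vector kernels read a full 7 rows
   * but point the exhausted ones at `zero` so they contribute nothing. */
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = buffer[c];
    for (size_t r = 0; r < rows; r++) {
      acc += input[r * input_stride + c];
    }
    pre_quant_out[c] = acc;  /* the kernels requantize to int8 here */
  }
}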
-// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t
vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(¶ms->fp32_neon.output_min); - const int8x8_t voutput_max = vld1_dup_s8(¶ms->fp32_neon.output_max); - for (; channels >= 8; channels -= 8) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_s8(vout01234567, voutput_min); - - vout01234567 = vmin_s8(vout01234567, voutput_max); - - vst1_s8(output, vout01234567); output += 8; - } - if 
XNN_UNLIKELY(channels != 0) { - { - const int8x8_t vi0x01234567 = vld1_s8(i0); - const int8x8_t vi1x01234567 = vld1_s8(i1); - const int8x8_t vi2x01234567 = vld1_s8(i2); - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, voutput_min); - vout01234567 = vmin_s8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c deleted file mode 100644 index 4346e335af9..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c +++ /dev/null @@ -1,310 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 16; channels -= 16) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t
vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, 
vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c deleted file mode 100644 index 7b05435cb73..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c +++ /dev/null @@ -1,431 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias,
vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - 
vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 24; channels -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN,
vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min)); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max)); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1_s8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, 
vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c deleted file mode 100644 index e2d7f2320bf..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c +++ /dev/null @@ -1,493 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV =
vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - const int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV)); - const int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 
0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - 
vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(b + 24); - int32x4_t vaccSTUV = vld1q_s32(b + 28); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV)); - vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 32; channels -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1);
i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV)); - vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - 
float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR); - vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer 
+= 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c deleted file mode 100644 index 0ed613b43b2..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c +++ /dev/null @@ -1,242 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 =
vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neonv8.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 8; channels -= 8) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_s8(vout01234567, voutput_min); - - vout01234567 = vmin_s8(vout01234567, voutput_max); - - vst1_s8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const int8x8_t vi0x01234567 = vld1_s8(i0); - const int8x8_t vi1x01234567 = vld1_s8(i1); - const int8x8_t vi2x01234567 = vld1_s8(i2); - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t
vi5x01234567 = vld1_s8(i5); - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, voutput_min); - vout01234567 = vmin_s8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c deleted file mode 100644 index 9f598920639..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c +++ /dev/null @@ -1,155 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
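A note on the fmagic trick used by the scalar kernels deleted from here on: after clamping in fp32, adding a magic bias of 0x1.8p23 pushes the value into the low mantissa bits, so reinterpreting the float as an int32 and subtracting a precomputed bias pattern yields the rounded, zero-point-adjusted result without an explicit float-to-int conversion. A self-contained sketch under the standard fp32 layout; the constant and helper names are chosen here for illustration:

#include <stdint.h>
#include <string.h>

// "fmagic" rounding sketch: clamp in fp32, add 0x1.8p23f so the value lands
// in the low mantissa bits, then reinterpret the float as int32 and subtract
// the precomputed bias pattern to get round(x) + zero_point.
static inline uint32_t float_bits_f(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u);
  return u;
}

static int8_t quantize_fmagic_sketch(int32_t acc, float scale,
                                     int8_t output_min, int8_t output_max,
                                     int16_t output_zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p23f
  const int32_t magic_bias_less_output_zero_point =
      (int32_t) float_bits_f(magic_bias) - (int32_t) output_zero_point;
  const float output_min_less_zero_point =
      (float) (output_min - output_zero_point);
  const float output_max_less_zero_point =
      (float) (output_max - output_zero_point);

  float fpacc = (float) acc * scale;
  fpacc = fpacc < output_min_less_zero_point ? output_min_less_zero_point : fpacc;
  fpacc = fpacc > output_max_less_zero_point ? output_max_less_zero_point : fpacc;
  fpacc += magic_bias;
  return (int8_t) ((int32_t) float_bits_f(fpacc) - magic_bias_less_output_zero_point);
}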
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = *b; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const
int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c deleted file mode 100644 index f4b2aa286f4..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c +++ /dev/null @@ -1,261 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; -
b[1] = vacc1; - b += 2; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 2; channels -= 2) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - buffer += 2; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const 
int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = *buffer; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output = (int8_t) vout; - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c deleted file mode 100644 index d8b1bad84e7..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c +++ /dev/null @@ -1,367 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { -
int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = b[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = b[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 4; channels -= 4) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = buffer[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = buffer[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - buffer += 4; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 
+= vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output[2] = (int8_t) vout2; - output[3] = (int8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); - } -} diff --git 
a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c deleted file mode 100644 index b4ebf201c45..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c +++ /dev/null @@ -1,156 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = *b; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if
XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c deleted file mode 100644 index 683b8b08596..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c +++ /dev/null @@ -1,265 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
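A note on the imagic variant deleted in this and the neighboring hunks: it uses the same magic-bias rounding as fmagic, but clamps in the integer domain, with the min/max limits precomputed as bit patterns of the biased floats, so the final clamp runs on plain int32 values. A self-contained sketch; the constants and names are illustrative, derived from the usual fp32 layout rather than taken from the library's parameter setup:

#include <stdint.h>
#include <string.h>

// "imagic" rounding sketch: magic-bias rounding with the clamp applied to
// the biased integer bit pattern; the limits are themselves bit patterns.
static inline uint32_t float_bits_i(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u);
  return u;
}

static int8_t quantize_imagic_sketch(int32_t acc, float scale,
                                     int8_t output_min, int8_t output_max,
                                     int16_t output_zero_point) {
  const float magic_bias = 12582912.0f;  // 0x1.8p23f
  const int32_t magic_min =
      (int32_t) float_bits_i(magic_bias + (float) (output_min - output_zero_point));
  const int32_t magic_max =
      (int32_t) float_bits_i(magic_bias + (float) (output_max - output_zero_point));
  const int32_t magic_bias_less_zero_point =
      (int32_t) float_bits_i(magic_bias) - (int32_t) output_zero_point;

  float fpacc = (float) acc * scale + magic_bias;
  int32_t vout = (int32_t) float_bits_i(fpacc);
  vout = vout < magic_min ? magic_min : vout;  // clamp in the integer domain
  vout = vout > magic_max ? magic_max : vout;
  return (int8_t) (vout - magic_bias_less_zero_point);
}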
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1];
- i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - for (; channels >= 2; channels -= 2) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - buffer += 2; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0); - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1); - - vout0 = math_max_s32(vout0, vmagic_min); - vout1 = math_max_s32(vout1, vmagic_min); - - vout0 = math_min_s32(vout0, vmagic_max); - vout1 = math_min_s32(vout1, vmagic_max); - - vout0 -= vmagic_bias_less_zero_point; - vout1 -= vmagic_bias_less_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = *buffer; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output = (int8_t) vout; - } -} diff --git 
a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c deleted file mode 100644 index 914eda6727b..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c +++ /dev/null @@ -1,373 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] =
vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = b[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = b[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = 
params->fp32_scalar_imagic.magic_bias_less_zero_point; - for (; channels >= 4; channels -= 4) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = buffer[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = buffer[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - buffer += 4; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0); - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1); - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2); - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3); - - vout0 = math_max_s32(vout0, vmagic_min); - vout1 = math_max_s32(vout1, vmagic_min); - vout2 = math_max_s32(vout2, vmagic_min); - vout3 = math_max_s32(vout3, vmagic_min); - - vout0 = math_min_s32(vout0, vmagic_max); - vout1 = math_min_s32(vout1, vmagic_max); - vout2 = math_min_s32(vout2, vmagic_max); - vout3 = math_min_s32(vout3, vmagic_max); - - vout0 -= vmagic_bias_less_zero_point; - vout1 -= vmagic_bias_less_zero_point; - vout2 -= vmagic_bias_less_zero_point; - vout3 -= vmagic_bias_less_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output[2] = (int8_t) vout2; - output[3] = (int8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += 
vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c deleted file mode 100644 index 5d6a82001b9..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c +++ /dev/null @@ -1,155 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = *b; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if
XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c deleted file mode 100644 index fd03a5b6251..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c +++ /dev/null @@ -1,261 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> -#include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 =
(int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - for (; channels >= 2; channels -= 2) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - buffer += 2; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0); - const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1); - - int32_t vout0 = vrndacc0 + voutput_zero_point; - int32_t vout1 = vrndacc1 + voutput_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = *buffer; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output = 
(int8_t) vout; - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c deleted file mode 100644 index 04b92221ad2..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c +++ /dev/null @@ -1,367 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t); - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; -
vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = b[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = b[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t 
voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - for (; channels >= 4; channels -= 4) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = buffer[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = buffer[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - buffer += 4; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0); - const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1); - const int32_t vrndacc2 = (int32_t) lrintf(vfpacc2); - const int32_t vrndacc3 = (int32_t) lrintf(vfpacc3); - - int32_t vout0 = vrndacc0 + voutput_zero_point; - int32_t vout1 = vrndacc1 + voutput_zero_point; - int32_t vout2 = vrndacc2 + voutput_zero_point; - int32_t vout3 = vrndacc3 + voutput_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output[2] = (int8_t) vout2; - output[3] = (int8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const 
int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c deleted file mode 100644 index 4854826555f..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c16.c +++ /dev/null @@ -1,432 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*)
(i3 + 8)); - i3 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, 
vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - i3 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = 
_mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= 16; channels -= 16) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - i3 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = 
_mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - buffer += 16; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - vout01234567 = _mm_max_epi16(vout01234567, voutput_min); - vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min); - - __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); - - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; 
- - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - vout01234567 = _mm_max_epi16(vout01234567, voutput_min); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (int8_t) vout0123; - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c deleted file mode 100644 index 52f7af75d0c..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c24.c +++ /dev/null @@ 
-1,631 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8); - const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567,
vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB 
= _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = 
_mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8); - const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - 
vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = 
_mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= 24; channels -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = 
_mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8); - const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vsgnacc01234567 = 
_mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF); - const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20))); - buffer += 24; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - vout01234567 = _mm_max_epi16(vout01234567, voutput_min); - vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min); - voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min); - - __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN); - - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi2x01234567 = 
_mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - vout01234567 = _mm_max_epi16(vout01234567, voutput_min); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (int8_t) vout0123; - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c deleted file mode 100644 index 1414f2c4aa8..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse2-c8.c +++ /dev/null @@ -1,335 +0,0 @@ -// Auto-generated file. Do not edit! 
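// --- Editor's sketch (not part of the original patch) -----------------------
// Every "7p7x" multipass kernel removed in this patch shares one control
// flow: a first pass seeds an int32 scratch buffer with the init bias plus
// 7 rows, middle passes fold 7 more rows into the buffer, and a final pass
// adds the remaining 1..7 rows (tail row pointers are redirected to a zero
// row) before requantizing. A minimal scalar rendition for a single channel,
// assuming unit row stride; the function name is illustrative, not XNNPACK
// API.
#include <stddef.h>
#include <stdint.h>

int32_t gavgpool_7p7x_sum(size_t rows, const int8_t* input, int32_t init_bias) {
  // Requires rows > 7, as the kernels assert.
  int32_t acc = init_bias;       // first pass is seeded with the init bias
  size_t r = 0;
  for (; r < 7; r++) {           // first pass: exactly 7 rows
    acc += input[r];
  }
  for (rows -= 7; rows > 7; rows -= 7) {  // same loop bound the kernels use
    for (size_t k = 0; k < 7; k++) {      // middle pass: 7 more rows
      acc += input[r++];
    }
  }
  for (size_t k = 0; k < rows; k++) {     // final pass: 1..7 leftover rows
    acc += input[r++];
  }
  return acc;  // the fp32 variants scale and requantize this sum
}
// -----------------------------------------------------------------------------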
-// Template: src/qs8-gavgpool/multipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) {
- i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i 
voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= 8; channels -= 8) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - vout01234567 = _mm_max_epi16(vout01234567, voutput_min); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8); - const __m128i vi3x01234567 = 
_mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567); - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - vout01234567 = _mm_max_epi16(vout01234567, voutput_min); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (int8_t) vout0123; - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c deleted file mode 100644 index 240a903c00b..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c16.c +++ /dev/null @@ -1,348 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
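// --- Editor's sketch (not part of the original patch) -----------------------
// The SSE2 kernels above and the SSE4.1 kernels below differ chiefly in how
// they widen signed 8-bit inputs to 16 bits. A self-contained comparison of
// the two idioms, assuming an SSE4.1-capable build (e.g. -msse4.1); variable
// names are illustrative.
#include <stdint.h>
#include <stdio.h>
#include <smmintrin.h>  // SSE4.1; also provides the SSE2 intrinsics used below

int main(void) {
  const int8_t data[8] = {-128, -1, 0, 1, 2, 3, 64, 127};
  const __m128i v = _mm_loadl_epi64((const __m128i*) data);

  // SSE2 idiom: interleave each byte with itself so it fills both halves of
  // a 16-bit lane, then arithmetic-shift right by 8; the shift smears the
  // sign bit across the high byte, completing the sign extension.
  const __m128i wide_sse2 = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);

  // SSE4.1 idiom: one dedicated sign-extension instruction (PMOVSXBW).
  const __m128i wide_sse41 = _mm_cvtepi8_epi16(v);

  int16_t a[8], b[8];
  _mm_storeu_si128((__m128i*) a, wide_sse2);
  _mm_storeu_si128((__m128i*) b, wide_sse41);
  for (int i = 0; i < 8; i++) {
    printf("%6d %6d\n", a[i], b[i]);  // the two columns match lane by lane
  }
  return 0;
}
// -----------------------------------------------------------------------------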
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF =
_mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, 
_mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 16; channels -= 16) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 
+= 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - buffer += 16; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 
vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c deleted file mode 100644 index fc71610533d..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c24.c +++ /dev/null @@ -1,493 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
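// --- Editor's sketch (not part of the original patch) -----------------------
// All fp32 variants in this patch requantize the pooled int32 sums the same
// way: convert to float, multiply by the scale, clamp against
// (output_max - zero_point) while still in float, round back to int32
// (CVTPS2DQ, round-to-nearest-even), add the zero point with saturation, and
// clamp against output_min while packing down to int8. A scalar reference of
// that recipe, assuming the default rounding mode; the function name is
// illustrative.
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int8_t requantize_fp32(int32_t acc, float scale,
                       float output_max_less_zero_point,
                       int32_t output_zero_point, int32_t output_min) {
  float fpacc = (float) acc * scale;
  // Upper clamp happens in float, before the zero point (_mm_min_ps above).
  if (fpacc > output_max_less_zero_point) fpacc = output_max_less_zero_point;
  // lrintf matches CVTPS2DQ under the default round-to-nearest-even mode.
  int32_t n = (int32_t) lrintf(fpacc) + output_zero_point;
  // Lower clamp after the zero point (_mm_max_epi16 / _mm_max_epi8 above).
  if (n < output_min) n = output_min;
  return (int8_t) n;
}

int main(void) {
  // Example: 49 pooled samples summing to -300, scale 1/49, zero point 3:
  // -300/49 = -6.12 -> rounds to -6, plus 3 gives -3.
  printf("%d\n", requantize_fp32(-300, 1.0f / 49.0f, 127.0f - 3.0f, 3, -128));
  return 0;
}
// -----------------------------------------------------------------------------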
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN =
_mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) 
((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = 
_mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = 
_mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 24; channels -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, 
vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20))); - buffer += 24; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - 
vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c deleted file mode 100644 index 40f8b2a1966..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-sse41-c8.c +++ /dev/null @@ -1,276 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 =
_mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 8; channels -= 8) { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - 
vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c 
b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c deleted file mode 100644 index 918a43d90cb..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c +++ /dev/null @@ -1,350 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - i6 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - const v128_t vacc89AB =
wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - b += 16; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - i6 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - v128_t vacc89AB = wasm_v128_load(b + 8); - v128_t vaccCDEF = wasm_v128_load(b + 12); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - b += 16; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) 
{ - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 16; channels -= 16) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - i6 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - v128_t vacc89AB = wasm_v128_load(buffer + 8); - v128_t vaccCDEF = wasm_v128_load(buffer + 12); - buffer += 16; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - 
vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - - v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while 
(channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c deleted file mode 100644 index 8ad1e626ad3..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c +++ /dev/null @@ -1,492 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16); - i0 += 24; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN =
wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - const v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - const v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16); - i0 += 24; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t 
vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - v128_t vacc89AB = wasm_v128_load(b + 8); - v128_t vaccCDEF = wasm_v128_load(b + 12); - v128_t vaccGHIJ = wasm_v128_load(b + 16); - v128_t vaccKLMN = wasm_v128_load(b + 20); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 
8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 24; channels -= 24) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16); - i0 += 24; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = 
wasm_i16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - v128_t vacc89AB = wasm_v128_load(buffer + 8); - v128_t vaccCDEF = wasm_v128_load(buffer + 12); - v128_t vaccGHIJ = wasm_v128_load(buffer + 16); - v128_t vaccKLMN = wasm_v128_load(buffer + 20); - buffer += 24; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = 
wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - - v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNGHIJKLMN = wasm_i8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNGHIJKLMN = wasm_i8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store64_lane(output + 16, voutGHIJKLMNGHIJKLMN, 0); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c deleted file mode 100644 index 7423db0007a..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c +++ /dev/null @@ -1,556 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_i16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_i16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_i16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_i16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_i16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF =
wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_i16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_i16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - const v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - const v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - const v128_t vaccOPQR = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV)); - const v128_t vaccSTUV = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - wasm_v128_store(b + 24, vaccOPQR); - wasm_v128_store(b + 28, vaccSTUV); - b += 32; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + 
input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_i16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_i16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_i16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_i16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_i16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_i16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_i16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = 
wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - v128_t vacc89AB = wasm_v128_load(b + 8); - v128_t vaccCDEF = wasm_v128_load(b + 12); - v128_t vaccGHIJ = wasm_v128_load(b + 16); - v128_t vaccKLMN = wasm_v128_load(b + 20); - v128_t vaccOPQR = wasm_v128_load(b + 24); - v128_t vaccSTUV = wasm_v128_load(b + 28); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV)); - vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - wasm_v128_store(b + 24, vaccOPQR); - wasm_v128_store(b + 28, vaccSTUV); - b += 32; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = 
wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 32; channels -= 32) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_i16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_i16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_i16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_i16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_i16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_i16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_i16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = 
wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - v128_t vacc89AB = wasm_v128_load(buffer + 8); - v128_t vaccCDEF = wasm_v128_load(buffer + 12); - v128_t vaccGHIJ = wasm_v128_load(buffer + 16); - v128_t vaccKLMN = wasm_v128_load(buffer + 20); - v128_t vaccOPQR = wasm_v128_load(buffer + 24); - v128_t vaccSTUV = wasm_v128_load(buffer + 28); - buffer += 32; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV)); - vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN); - vaccOPQR = wasm_f32x4_convert_i32x4(vaccOPQR); - vaccSTUV = wasm_f32x4_convert_i32x4(vaccSTUV); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - vaccOPQR = wasm_f32x4_mul(vaccOPQR, vscale); - vaccSTUV = wasm_f32x4_mul(vaccSTUV, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - vaccOPQR = wasm_f32x4_add(vaccOPQR, vmagic_bias); - vaccSTUV = wasm_f32x4_add(vaccSTUV, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - vaccOPQR = wasm_i32x4_max(vaccOPQR, vmagic_min); - vaccSTUV = wasm_i32x4_max(vaccSTUV, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = 
wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - v128_t voutOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV); - - v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNOPQRSTUV = wasm_i8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); - - vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = wasm_i8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); - output += 32; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c deleted file mode 100644 index d6fea8e71ce..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c +++ /dev/null @@ -1,278 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t
vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 8; channels -= 8) { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, 
vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - - v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); - - vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); - - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); - - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c deleted file mode 100644 index 4159107afb0..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c16.c +++ /dev/null @@ -1,311 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c
!= 0; c = doz(c, 16)) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); - for (; channels >= 16; channels -= 16) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; -
int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const 
int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c deleted file mode 100644 index 71a1dc2839f..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c24.c +++ /dev/null @@ -1,432 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF
= vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const 
int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point =
vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max); - for (; channels >= 24; channels -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift); - vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB =
vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier); - vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift); - vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min)); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max)); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1_s8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = 
vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c deleted file mode 100644 index baa6ecbe1d4..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c32.c +++ /dev/null @@ -1,494 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const 
int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - const int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - const int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - const int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - const int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - const int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV)); - const int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 
= vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, 
vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(b + 24); - int32x4_t vaccSTUV = vld1q_s32(b + 28); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV)); - vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->rndnu_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->rndnu_neon.output_max); - for (; channels >= 32; channels -= 32) { - const 
int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - vacc89AB = vaddw_s16(vacc89AB, vget_low_s16(vsum89ABCDEF)); - vaccCDEF = vaddw_s16(vaccCDEF, 
vget_high_s16(vsum89ABCDEF)); - vaccGHIJ = vaddw_s16(vaccGHIJ, vget_low_s16(vsumGHIJKLMN)); - vaccKLMN = vaddw_s16(vaccKLMN, vget_high_s16(vsumGHIJKLMN)); - vaccOPQR = vaddw_s16(vaccOPQR, vget_low_s16(vsumOPQRSTUV)); - vaccSTUV = vaddw_s16(vaccSTUV, vget_high_s16(vsumOPQRSTUV)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift); - vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift); - vaccOPQR = vqshlq_s32(vaccOPQR, vleft_pre_shift); - vaccSTUV = vqshlq_s32(vaccSTUV, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier); - vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier); - vaccOPQR = vqdmulhq_s32(vaccOPQR, vmultiplier); - vaccSTUV = vqdmulhq_s32(vaccSTUV, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift); - vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift); - vaccOPQR = vrshlq_s32(vaccOPQR, vleft_post_shift); - vaccSTUV = vrshlq_s32(vaccSTUV, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = 
vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c deleted file mode 100644 index d0645a672a6..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7p7x-minmax-rndnu-neon-c8.c +++ /dev/null @@ -1,243 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
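// [Illustration, not part of the original patch] The "rndnu" kernels deleted
// below requantize each 32-bit accumulator with a saturating left pre-shift
// (vqshlq_s32), a saturating doubling multiply returning the high half
// (vqdmulhq_s32), and a rounding shift by a negative amount (vrshlq_s32).
// A minimal scalar sketch of the same sequence, assuming left_post_shift < 0
// and eliding the saturation; every name here is illustrative, not XNNPACK API:
#include <stdint.h>

static inline int8_t rndnu_requantize_sketch(
    int32_t acc, uint32_t left_pre_shift, int32_t multiplier,
    int32_t left_post_shift, int16_t output_zero_point,
    int8_t output_min, int8_t output_max)
{
  const int64_t preshifted = (int64_t) acc << left_pre_shift;        // vqshlq_s32 (saturation elided)
  const int32_t high = (int32_t) ((preshifted * multiplier) >> 31);  // vqdmulhq_s32: (2*a*b) >> 32
  const uint32_t shift = (uint32_t) -left_post_shift;                // negative left shift = right shift
  const int32_t rounded =
      (int32_t) (((int64_t) high + (INT64_C(1) << (shift - 1))) >> shift);  // vrshlq_s32 rounding
  int32_t out = rounded + output_zero_point;  // the kernels do this as vqaddq_s16 after narrowing
  if (out < output_min) out = output_min;     // vmax_s8
  if (out > output_max) out = output_max;     // vmin_s8
  return (int8_t) out;
}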
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int32_t* buffer,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
-  int32_t* b = buffer;
-  size_t c = channels;
-  for (; c != 0; c = doz(c, 8)) {
-    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-
-    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-    const int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-    const int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-
-    vst1q_s32(b, vacc0123); b += 4;
-    vst1q_s32(b, vacc4567); b += 4;
-  }
-
-  for (rows -= 7; rows > 7; rows -= 7) {
-    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
-    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
-    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
-    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
-    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
-    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
-    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
-
-    int32_t* b = buffer;
-    size_t c = channels;
-    for (; c != 0; c = doz(c, 8)) {
-      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-
-      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-      int32x4_t vacc0123 = vld1q_s32(b);
-      int32x4_t vacc4567 = vld1q_s32(b + 4);
-      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567));
-      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567));
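//   (Note on the multipass structure, added for clarity: each pass consumes
//   7 input rows, widening the int8 pixels to int16 via vaddl_s8/vaddw_s8
//   and adding the result into the int32 scratch `buffer` reloaded just
//   above. The first pass seeded `buffer` with init_bias; only the final
//   pass of 7 or fewer rows requantizes and stores int8 output instead of
//   writing `buffer` back.)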
- - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - } - - i0 = (const int8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const int8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const int8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const int8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const int8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const int8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const int8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(¶ms->rndnu_neon.output_min); - const int8x8_t voutput_max = vld1_dup_s8(¶ms->rndnu_neon.output_max); - for (; channels >= 8; channels -= 8) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_s8(vout01234567, voutput_min); - - vout01234567 = vmin_s8(vout01234567, voutput_max); - - vst1_s8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const int8x8_t vi0x01234567 = vld1_s8(i0); - const int8x8_t vi1x01234567 = vld1_s8(i1); - const int8x8_t vi2x01234567 = vld1_s8(i2); - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); - vsum01234567 = 
vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vsum01234567)); - vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vsum01234567)); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, voutput_min); - vout01234567 = vmin_s8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c deleted file mode 100644 index e32741d8088..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c16.c +++ /dev/null @@ -1,199 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
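// [Illustration, not part of the original patch] The fp32 kernels deleted
// below requantize by converting the accumulator to float, scaling it, and
// rounding via the "magic bias" trick: adding 2**23-scale bias forces the
// integer part into the mantissa, so reinterpreting the float as int32 and
// subtracting a precomputed constant yields the rounded, zero-point-adjusted
// value. A scalar sketch assuming round-to-nearest-even FP mode and the
// usual 0x1.8p+23 bias; names are illustrative, not XNNPACK API:
#include <stdint.h>
#include <string.h>

static inline int32_t fp32_magic_bias_round_sketch(
    int32_t acc, float scale, int32_t magic_bias_less_output_zero_point)
{
  float fpacc = (float) acc * scale;                 // vcvtq_f32_s32 + vmulq_f32
  fpacc += 12582912.0f;                              // vaddq_f32 with 0x1.8p+23 (assumed magic bias)
  int32_t bits;
  memcpy(&bits, &fpacc, sizeof bits);                // vreinterpretq_s32_f32
  return bits - magic_bias_less_output_zero_point;   // vqsubq_s32 (saturation elided here)
}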
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
-  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
-  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
-  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
-  const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neon.output_min);
-  const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neon.output_max);
-  for (; channels >= 16; channels -= 16) {
-    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
-
-    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-    const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
-
-    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-    const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
-    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-    const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
-    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-    const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
-    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-    const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
-    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
-
-    int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-    int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-    int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
-    int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
-
-    float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
-    float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
-    float32x4_t
vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { 
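//   (Note on the tail handling below, added for clarity: with at least 8
//   channels remaining the full 8-lane vector is stored at once; otherwise
//   the last 4/2/1 channels are written with reinterpreted lane stores,
//   rotating the vector by the stored amount with vext_s8 between stores.)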
- vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c deleted file mode 100644 index 16088fc645d..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c24.c +++ /dev/null @@ -1,229 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neon.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neon.output_max); - for (; channels >= 24; channels -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - 
vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = 
vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min)); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max)); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1_s8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git 
a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c deleted file mode 100644 index 68e1ccb87d6..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c32.c +++ /dev/null @@ -1,254 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neon.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neon.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neon.output_max); - for (; channels >= 32; channels -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - 
vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV)); - int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - vaccOPQR = vreinterpretq_s32_f32(vaddq_f32(vfpaccOPQR, vmagic_bias)); - vaccSTUV = vreinterpretq_s32_f32(vaddq_f32(vfpaccSTUV, vmagic_bias)); - - 
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = vqsubq_s32(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, 
vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c deleted file mode 100644 index 4242983351b..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neon-c8.c +++ /dev/null @@ -1,168 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neon.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neon.output_max); - for (; channels >= 8; channels -= 8) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; -
vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_s8(vout01234567, voutput_min); - - vout01234567 = vmin_s8(vout01234567, voutput_max); - - vst1_s8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, voutput_min); - vout01234567 = vmin_s8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = 
vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c deleted file mode 100644 index 1460ec981a7..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c16.c +++ /dev/null @@ -1,194 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 16; channels -= 16) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = 
vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = 
vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c deleted file mode 100644 index 508505448eb..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c24.c +++ /dev/null @@ -1,223 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 24; channels -= 24) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 
= vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t 
vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min)); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max)); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1_s8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c deleted file mode 100644 index 31627087870..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c32.c +++ /dev/null @@ 
-1,247 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x16_t voutput_min = vld1q_dup_s8(&params->fp32_neonv8.output_min); - const int8x16_t voutput_max = vld1q_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 32; channels -= 32) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8; - const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8; - const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8; - int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF); - const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8; - int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8; - int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF); - const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN); - const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF); - const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 
+= 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN); - const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF); - const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN); - const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8; - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF); - const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN); - const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF)); - int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF)); - int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN)); - int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN)); - int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV)); - int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR); - vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = 
vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF); - int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF)); - int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_s8(output, vout0123456789ABCDEF); output += 16; - vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min)); - vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_s8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git 
a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c deleted file mode 100644 index 4e3c5c37204..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-neonv8-c8.c +++ /dev/null @@ -1,164 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const int8x8_t voutput_min = vld1_dup_s8(&params->fp32_neonv8.output_min); - const int8x8_t voutput_max = vld1_dup_s8(&params->fp32_neonv8.output_max); - for (; channels >= 8; channels -= 8) { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), 
vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_s8(vout01234567, voutput_min); - - vout01234567 = vmin_s8(vout01234567, voutput_max); - - vst1_s8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8; - const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8; - const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8; - int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567); - - const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567); - const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567); - const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567); - const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8; - vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567)); - int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567)); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - int8x8_t vout01234567 = vqmovn_s16(vacc01234567); - vout01234567 = vmax_s8(vout01234567, voutput_min); - vout01234567 = vmin_s8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4; - vout01234567 = vext_s8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2; - vout01234567 = vext_s8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_s8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c deleted file mode 100644 index 2b549395b53..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c +++ /dev/null @@ -1,88 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c deleted file mode 100644 index 975839fdcd1..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c +++ /dev/null @@ -1,147 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 2; channels -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const 
int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output = (int8_t) vout; - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c deleted file mode 100644 index 852abe10412..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c +++ /dev/null @@ -1,189 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 4; channels -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += 
vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; - - output[0] = (int8_t) vout0; - output[1] = (int8_t) vout1; - output[2] = (int8_t) vout2; - output[3] = (int8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c deleted file mode 100644 index 8e292af5ca8..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c +++ /dev/null @@ -1,89 +0,0 @@ -// Auto-generated 
file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const int8_t* i0 = input; - const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (int8_t) vout; - } while (--channels != 0); -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c deleted file mode 100644 index 66983552bd8..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c +++ /dev/null @@ -1,151 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
-
-#include <assert.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
-  const float vscale = params->fp32_scalar_imagic.scale;
-  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
-  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
-  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
-  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
-  for (; channels >= 2; channels -= 2) {
-    const int32_t vi0x0 = (int32_t) i0[0];
-    const int32_t vi0x1 = (int32_t) i0[1];
-    i0 += 2;
-
-    int32_t vacc0 = vi0x0 + vinit_bias;
-    const int32_t vi1x0 = (int32_t) i1[0];
-    int32_t vacc1 = vi0x1 + vinit_bias;
-    const int32_t vi1x1 = (int32_t) i1[1];
-    i1 += 2;
-
-    vacc0 += vi1x0;
-    const int32_t vi2x0 = (int32_t) i2[0];
-    vacc1 += vi1x1;
-    const int32_t vi2x1 = (int32_t) i2[1];
-    i2 += 2;
-    vacc0 += vi2x0;
-    const int32_t vi3x0 = (int32_t) i3[0];
-    vacc1 += vi2x1;
-    const int32_t vi3x1 = (int32_t) i3[1];
-    i3 += 2;
-    vacc0 += vi3x0;
-    const int32_t vi4x0 = (int32_t) i4[0];
-    vacc1 += vi3x1;
-    const int32_t vi4x1 = (int32_t) i4[1];
-    i4 += 2;
-    vacc0 += vi4x0;
-    const int32_t vi5x0 = (int32_t) i5[0];
-    vacc1 += vi4x1;
-    const int32_t vi5x1 = (int32_t) i5[1];
-    i5 += 2;
-    vacc0 += vi5x0;
-    const int32_t vi6x0 = (int32_t) i6[0];
-    vacc1 += vi5x1;
-    const int32_t vi6x1 = (int32_t) i6[1];
-    i6 += 2;
-
-    vacc0 += vi6x0;
-    vacc1 += vi6x1;
-
-    float vfpacc0 = (float) vacc0 * vscale;
-    float vfpacc1 = (float) vacc1 * vscale;
-
-    vfpacc0 += vmagic_bias;
-    vfpacc1 += vmagic_bias;
-
-    int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
-    int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
-
-    vout0 = math_max_s32(vout0, vmagic_min);
-    vout1 = math_max_s32(vout1, vmagic_min);
-
-    vout0 = math_min_s32(vout0, vmagic_max);
-    vout1 = math_min_s32(vout1, vmagic_max);
-
-    vout0 -= vmagic_bias_less_zero_point;
-    vout1 -= vmagic_bias_less_zero_point;
-
-    output[0] = (int8_t) vout0;
-    output[1] = (int8_t) vout1;
-    output += 2;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    int32_t vacc = vinit_bias;
-    const int32_t vi0 = (int32_t) *i0;
-    const int32_t vi1 = (int32_t) *i1;
-
-    vacc += vi0;
-    const int32_t vi2 = (int32_t) *i2;
-    vacc += vi1;
-    const int32_t vi3 = (int32_t) *i3;
-    vacc += vi2;
-    const int32_t vi4 = (int32_t) *i4;
-    vacc += vi3;
-    const int32_t vi5 = (int32_t) *i5;
-    vacc += vi4;
-    const int32_t vi6 = (int32_t) *i6;
-
-    vacc += vi5;
-    vacc += vi6;
-
-    float vfpacc = (float) vacc * vscale;
-    vfpacc += vmagic_bias;
-    int32_t vout = (int32_t) float_as_uint32(vfpacc);
-    vout = math_max_s32(vout, vmagic_min);
-    vout = math_min_s32(vout, vmagic_max);
-    vout -= vmagic_bias_less_zero_point;
-
-    *output = (int8_t) vout;
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c
deleted file mode 100644
index bd8ea982675..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c
+++ /dev/null
@@ -1,195 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
-  const float vscale = params->fp32_scalar_imagic.scale;
-  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
-  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
-  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
-  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
-  for (; channels >= 4; channels -= 4) {
-    const int32_t vi0x0 = (int32_t) i0[0];
-    const int32_t vi0x1 = (int32_t) i0[1];
-    const int32_t vi0x2 = (int32_t) i0[2];
-    const int32_t vi0x3 = (int32_t) i0[3];
-    i0 += 4;
-
-    int32_t vacc0 = vi0x0 + vinit_bias;
-    const int32_t vi1x0 = (int32_t) i1[0];
-    int32_t vacc1 = vi0x1 + vinit_bias;
-    const int32_t vi1x1 = (int32_t) i1[1];
-    int32_t vacc2 = vi0x2 + vinit_bias;
-    const int32_t vi1x2 = (int32_t) i1[2];
-    int32_t vacc3 = vi0x3 + vinit_bias;
-    const int32_t vi1x3 = (int32_t) i1[3];
-    i1 += 4;
-
-    vacc0 += vi1x0;
-    const int32_t vi2x0 = (int32_t) i2[0];
-    vacc1 += vi1x1;
-    const int32_t vi2x1 = (int32_t) i2[1];
-    vacc2 += vi1x2;
-    const int32_t vi2x2 = (int32_t) i2[2];
-    vacc3 += vi1x3;
-    const int32_t vi2x3 = (int32_t) i2[3];
-    i2 += 4;
-    vacc0 += vi2x0;
-    const int32_t vi3x0 = (int32_t) i3[0];
-    vacc1 += vi2x1;
-    const int32_t vi3x1 = (int32_t) i3[1];
-    vacc2 += vi2x2;
-    const int32_t vi3x2 = (int32_t) i3[2];
-    vacc3 += vi2x3;
-    const int32_t vi3x3 = (int32_t) i3[3];
-    i3 += 4;
-    vacc0 += vi3x0;
-    const int32_t vi4x0 = (int32_t) i4[0];
-    vacc1 += vi3x1;
-    const int32_t vi4x1 = (int32_t) i4[1];
-    vacc2 += vi3x2;
-    const int32_t vi4x2 = (int32_t) i4[2];
-    vacc3 += vi3x3;
-    const int32_t vi4x3 = (int32_t) i4[3];
-    i4 += 4;
-    vacc0 += vi4x0;
-    const int32_t vi5x0 = (int32_t) i5[0];
-    vacc1 += vi4x1;
-    const int32_t vi5x1 = (int32_t) i5[1];
-    vacc2 += vi4x2;
-    const int32_t vi5x2 = (int32_t) i5[2];
-    vacc3 += vi4x3;
-    const int32_t vi5x3 = (int32_t) i5[3];
-    i5 += 4;
-    vacc0 += vi5x0;
-    const int32_t vi6x0 = (int32_t) i6[0];
-    vacc1 += vi5x1;
-    const int32_t vi6x1 = (int32_t) i6[1];
-    vacc2 += vi5x2;
-    const int32_t vi6x2 = (int32_t) i6[2];
-    vacc3 += vi5x3;
-    const int32_t vi6x3 = (int32_t) i6[3];
-    i6 += 4;
-
-    vacc0 += vi6x0;
-    vacc1 += vi6x1;
-    vacc2 += vi6x2;
-    vacc3 += vi6x3;
-
-    float vfpacc0 = (float) vacc0 * vscale;
-    float vfpacc1 = (float) vacc1 * vscale;
-    float vfpacc2 = (float) vacc2 * vscale;
-    float vfpacc3 = (float) vacc3 * vscale;
-
-    vfpacc0 += vmagic_bias;
-    vfpacc1 += vmagic_bias;
-    vfpacc2 += vmagic_bias;
-    vfpacc3 += vmagic_bias;
-
-    int32_t vout0 = (int32_t) float_as_uint32(vfpacc0);
-    int32_t vout1 = (int32_t) float_as_uint32(vfpacc1);
-    int32_t vout2 = (int32_t) float_as_uint32(vfpacc2);
-    int32_t vout3 = (int32_t) float_as_uint32(vfpacc3);
-
-    vout0 = math_max_s32(vout0, vmagic_min);
-    vout1 = math_max_s32(vout1, vmagic_min);
-    vout2 = math_max_s32(vout2, vmagic_min);
-    vout3 = math_max_s32(vout3, vmagic_min);
-
-    vout0 = math_min_s32(vout0, vmagic_max);
-    vout1 = math_min_s32(vout1, vmagic_max);
-    vout2 = math_min_s32(vout2, vmagic_max);
-    vout3 = math_min_s32(vout3, vmagic_max);
-
-    vout0 -= vmagic_bias_less_zero_point;
-    vout1 -= vmagic_bias_less_zero_point;
-    vout2 -= vmagic_bias_less_zero_point;
-    vout3 -= vmagic_bias_less_zero_point;
-
-    output[0] = (int8_t) vout0;
-    output[1] = (int8_t) vout1;
-    output[2] = (int8_t) vout2;
-    output[3] = (int8_t) vout3;
-    output += 4;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-      int32_t vacc = vinit_bias;
-      const int32_t vi0 = (int32_t) *i0++;
-      const int32_t vi1 = (int32_t) *i1++;
-
-      vacc += vi0;
-      const int32_t vi2 = (int32_t) *i2++;
-      vacc += vi1;
-      const int32_t vi3 = (int32_t) *i3++;
-      vacc += vi2;
-      const int32_t vi4 = (int32_t) *i4++;
-      vacc += vi3;
-      const int32_t vi5 = (int32_t) *i5++;
-      vacc += vi4;
-      const int32_t vi6 = (int32_t) *i6++;
-
-      vacc += vi5;
-      vacc += vi6;
-
-      float vfpacc = (float) vacc * vscale;
-      vfpacc += vmagic_bias;
-      int32_t vout = (int32_t) float_as_uint32(vfpacc);
-      vout = math_max_s32(vout, vmagic_min);
-      vout = math_min_s32(vout, vmagic_max);
-      vout -= vmagic_bias_less_zero_point;
-
-      *output++ = (int8_t) vout;
-    } while (--channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c
deleted file mode 100644
index e4962965bb8..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c
+++ /dev/null
@@ -1,88 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <math.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
-  const float vscale = params->fp32_scalar_lrintf.scale;
-  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
-  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
-  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
-  do {
-    int32_t vacc = vinit_bias;
-    const int32_t vi0 = (int32_t) *i0++;
-    const int32_t vi1 = (int32_t) *i1++;
-
-    vacc += vi0;
-    const int32_t vi2 = (int32_t) *i2++;
-    vacc += vi1;
-    const int32_t vi3 = (int32_t) *i3++;
-    vacc += vi2;
-    const int32_t vi4 = (int32_t) *i4++;
-    vacc += vi3;
-    const int32_t vi5 = (int32_t) *i5++;
-    vacc += vi4;
-    const int32_t vi6 = (int32_t) *i6++;
-
-    vacc += vi5;
-    vacc += vi6;
-
-    float vfpacc = (float) vacc * vscale;
-    vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
-    vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
-    const int32_t vrndacc = (int32_t) lrintf(vfpacc);
-    int32_t vout = vrndacc + voutput_zero_point;
-
-    *output++ = (int8_t) vout;
-  } while (--channels != 0);
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c
deleted file mode 100644
index 6ec75ae7339..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <math.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
-  const float vscale = params->fp32_scalar_lrintf.scale;
-  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
-  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
-  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
-  for (; channels >= 2; channels -= 2) {
-    const int32_t vi0x0 = (int32_t) i0[0];
-    const int32_t vi0x1 = (int32_t) i0[1];
-    i0 += 2;
-
-    int32_t vacc0 = vi0x0 + vinit_bias;
-    const int32_t vi1x0 = (int32_t) i1[0];
-    int32_t vacc1 = vi0x1 + vinit_bias;
-    const int32_t vi1x1 = (int32_t) i1[1];
-    i1 += 2;
-
-    vacc0 += vi1x0;
-    const int32_t vi2x0 = (int32_t) i2[0];
-    vacc1 += vi1x1;
-    const int32_t vi2x1 = (int32_t) i2[1];
-    i2 += 2;
-    vacc0 += vi2x0;
-    const int32_t vi3x0 = (int32_t) i3[0];
-    vacc1 += vi2x1;
-    const int32_t vi3x1 = (int32_t) i3[1];
-    i3 += 2;
-    vacc0 += vi3x0;
-    const int32_t vi4x0 = (int32_t) i4[0];
-    vacc1 += vi3x1;
-    const int32_t vi4x1 = (int32_t) i4[1];
-    i4 += 2;
-    vacc0 += vi4x0;
-    const int32_t vi5x0 = (int32_t) i5[0];
-    vacc1 += vi4x1;
-    const int32_t vi5x1 = (int32_t) i5[1];
-    i5 += 2;
-    vacc0 += vi5x0;
-    const int32_t vi6x0 = (int32_t) i6[0];
-    vacc1 += vi5x1;
-    const int32_t vi6x1 = (int32_t) i6[1];
-    i6 += 2;
-
-    vacc0 += vi6x0;
-    vacc1 += vi6x1;
-
-    float vfpacc0 = (float) vacc0 * vscale;
-    float vfpacc1 = (float) vacc1 * vscale;
-
-    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
-    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
-
-    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
-    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
-
-    const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
-    const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
-
-    int32_t vout0 = vrndacc0 + voutput_zero_point;
-    int32_t vout1 = vrndacc1 + voutput_zero_point;
-
-    output[0] = (int8_t) vout0;
-    output[1] = (int8_t) vout1;
-    output += 2;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    int32_t vacc = vinit_bias;
-    const int32_t vi0 = (int32_t) *i0;
-    const int32_t vi1 = (int32_t) *i1;
-
-    vacc += vi0;
-    const int32_t vi2 = (int32_t) *i2;
-    vacc += vi1;
-    const int32_t vi3 = (int32_t) *i3;
-    vacc += vi2;
-    const int32_t vi4 = (int32_t) *i4;
-    vacc += vi3;
-    const int32_t vi5 = (int32_t) *i5;
-    vacc += vi4;
-    const int32_t vi6 = (int32_t) *i6;
-
-    vacc += vi5;
-    vacc += vi6;
-
-    float vfpacc = (float) vacc * vscale;
-    vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
-    vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
-    const int32_t vrndacc = (int32_t) lrintf(vfpacc);
-    int32_t vout = vrndacc + voutput_zero_point;
-
-    *output = (int8_t) vout;
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c
deleted file mode 100644
index 086c4a0c112..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-scalar.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <math.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
-  const float vscale = params->fp32_scalar_lrintf.scale;
-  const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
-  const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
-  const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
-  for (; channels >= 4; channels -= 4) {
-    const int32_t vi0x0 = (int32_t) i0[0];
-    const int32_t vi0x1 = (int32_t) i0[1];
-    const int32_t vi0x2 = (int32_t) i0[2];
-    const int32_t vi0x3 = (int32_t) i0[3];
-    i0 += 4;
-
-    int32_t vacc0 = vi0x0 + vinit_bias;
-    const int32_t vi1x0 = (int32_t) i1[0];
-    int32_t vacc1 = vi0x1 + vinit_bias;
-    const int32_t vi1x1 = (int32_t) i1[1];
-    int32_t vacc2 = vi0x2 + vinit_bias;
-    const int32_t vi1x2 = (int32_t) i1[2];
-    int32_t vacc3 = vi0x3 + vinit_bias;
-    const int32_t vi1x3 = (int32_t) i1[3];
-    i1 += 4;
-
-    vacc0 += vi1x0;
-    const int32_t vi2x0 = (int32_t) i2[0];
-    vacc1 += vi1x1;
-    const int32_t vi2x1 = (int32_t) i2[1];
-    vacc2 += vi1x2;
-    const int32_t vi2x2 = (int32_t) i2[2];
-    vacc3 += vi1x3;
-    const int32_t vi2x3 = (int32_t) i2[3];
-    i2 += 4;
-    vacc0 += vi2x0;
-    const int32_t vi3x0 = (int32_t) i3[0];
-    vacc1 += vi2x1;
-    const int32_t vi3x1 = (int32_t) i3[1];
-    vacc2 += vi2x2;
-    const int32_t vi3x2 = (int32_t) i3[2];
-    vacc3 += vi2x3;
-    const int32_t vi3x3 = (int32_t) i3[3];
-    i3 += 4;
-    vacc0 += vi3x0;
-    const int32_t vi4x0 = (int32_t) i4[0];
-    vacc1 += vi3x1;
-    const int32_t vi4x1 = (int32_t) i4[1];
-    vacc2 += vi3x2;
-    const int32_t vi4x2 = (int32_t) i4[2];
-    vacc3 += vi3x3;
-    const int32_t vi4x3 = (int32_t) i4[3];
-    i4 += 4;
-    vacc0 += vi4x0;
-    const int32_t vi5x0 = (int32_t) i5[0];
-    vacc1 += vi4x1;
-    const int32_t vi5x1 = (int32_t) i5[1];
-    vacc2 += vi4x2;
-    const int32_t vi5x2 = (int32_t) i5[2];
-    vacc3 += vi4x3;
-    const int32_t vi5x3 = (int32_t) i5[3];
-    i5 += 4;
-    vacc0 += vi5x0;
-    const int32_t vi6x0 = (int32_t) i6[0];
-    vacc1 += vi5x1;
-    const int32_t vi6x1 = (int32_t) i6[1];
-    vacc2 += vi5x2;
-    const int32_t vi6x2 = (int32_t) i6[2];
-    vacc3 += vi5x3;
-    const int32_t vi6x3 = (int32_t) i6[3];
-    i6 += 4;
-
-    vacc0 += vi6x0;
-    vacc1 += vi6x1;
-    vacc2 += vi6x2;
-    vacc3 += vi6x3;
-
-    float vfpacc0 = (float) vacc0 * vscale;
-    float vfpacc1 = (float) vacc1 * vscale;
-    float vfpacc2 = (float) vacc2 * vscale;
-    float vfpacc3 = (float) vacc3 * vscale;
-
-    vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
-    vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
-    vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
-    vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
-
-    vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
-    vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
-    vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
-    vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
-
-    const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0);
-    const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1);
-    const int32_t vrndacc2 = (int32_t) lrintf(vfpacc2);
-    const int32_t vrndacc3 = (int32_t) lrintf(vfpacc3);
-
-    int32_t vout0 = vrndacc0 + voutput_zero_point;
-    int32_t vout1 = vrndacc1 + voutput_zero_point;
-    int32_t vout2 = vrndacc2 + voutput_zero_point;
-    int32_t vout3 = vrndacc3 + voutput_zero_point;
-
-    output[0] = (int8_t) vout0;
-    output[1] = (int8_t) vout1;
-    output[2] = (int8_t) vout2;
-    output[3] = (int8_t) vout3;
-    output += 4;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-      int32_t vacc = vinit_bias;
-      const int32_t vi0 = (int32_t) *i0++;
-      const int32_t vi1 = (int32_t) *i1++;
-
-      vacc += vi0;
-      const int32_t vi2 = (int32_t) *i2++;
-      vacc += vi1;
-      const int32_t vi3 = (int32_t) *i3++;
-      vacc += vi2;
-      const int32_t vi4 = (int32_t) *i4++;
-      vacc += vi3;
-      const int32_t vi5 = (int32_t) *i5++;
-      vacc += vi4;
-      const int32_t vi6 = (int32_t) *i6++;
-
-      vacc += vi5;
-      vacc += vi6;
-
-      float vfpacc = (float) vacc * vscale;
-      vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
-      vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
-      const int32_t vrndacc = (int32_t) lrintf(vfpacc);
-      int32_t vout = vrndacc + voutput_zero_point;
-
-      *output++ = (int8_t) vout;
-    } while (--channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c
deleted file mode 100644
index 8a92d6acc0d..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c16.c
+++ /dev/null
@@ -1,252 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-sse2.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
-  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
-  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
-  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
-  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
-  for (; channels >= 16; channels -= 16) {
-
-    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
-    const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
-    i0 += 16;
-
-    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
-    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
-    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
-    const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
-    i1 += 16;
-
-    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
-    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
-    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
-    const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
-    i2 += 16;
-
-    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
-    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
-    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
-    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
-    const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
-    i3 += 16;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
-    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
-    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
-    const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
-    i4 += 16;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
-    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
-    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
-    const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
-    i5 += 16;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
-    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
-    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
-    const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
-    i6 += 16;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
-    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
-
-    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
-    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
-    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
-    const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
-    __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
-    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
-
-    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-    vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
-    vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
-
-    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-    __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
-    __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
-
-    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-    vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
-    vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
-
-    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
-    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
-    vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
-    vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
-
-    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
-    vacc4567 = _mm_cvtps_epi32(vfpacc4567);
-    vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
-    vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
-
-    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
-    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
-
-    vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
-    vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
-
-    __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
-
-
-    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
-    output += 16;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-
-      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
-      i0 += 8;
-
-      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
-      i1 += 8;
-
-      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
-      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
-      i2 += 8;
-
-      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
-      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
-      i3 += 8;
-
-      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
-      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
-      i4 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
-      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
-      i5 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
-      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
-      i6 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-
-      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
-      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
-      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
-
-      vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-      vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-
-      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-
-      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-
-      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
-      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
-
-      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
-      vacc4567 = _mm_cvtps_epi32(vfpacc4567);
-
-      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
-      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
-
-      __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
-
-      if XNN_LIKELY(channels >= 8) {
-        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
-        output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
-          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
-          output += 4;
-        }
-        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
-        if (channels & 2) {
-          unaligned_store_u16(output, (uint16_t) vout0123);
-          vout0123 >>= 16;
-          output += 2;
-        }
-        if (channels & 1) {
-          *output = (int8_t) vout0123;
-          output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c
deleted file mode 100644
index fa4aa1fa133..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c24.c
+++ /dev/null
@@ -1,289 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-sse2.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
-  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
-  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
-  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
-  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
-  for (; channels >= 24; channels -= 24) {
-
-    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
-    const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
-    const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16));
-    i0 += 24;
-
-    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
-    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
-    const __m128i vxi0x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x89ABCDEF, vi0x89ABCDEF), 8);
-    const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
-    const __m128i vxi0xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi0xGHIJKLMN, vi0xGHIJKLMN), 8);
-    const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16));
-    i1 += 24;
-
-    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
-    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
-    const __m128i vxi1x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x89ABCDEF, vi1x89ABCDEF), 8);
-    const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
-    const __m128i vxi1xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi1xGHIJKLMN, vi1xGHIJKLMN), 8);
-    const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16));
-    i2 += 24;
-
-    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
-    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
-    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
-    const __m128i vxi2x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x89ABCDEF, vi2x89ABCDEF), 8);
-    const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
-    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
-    const __m128i vxi2xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi2xGHIJKLMN, vi2xGHIJKLMN), 8);
-    const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16));
-    i3 += 24;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
-    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
-    const __m128i vxi3x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x89ABCDEF, vi3x89ABCDEF), 8);
-    const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
-    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
-    const __m128i vxi3xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi3xGHIJKLMN, vi3xGHIJKLMN), 8);
-    const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16));
-    i4 += 24;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
-    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
-    const __m128i vxi4x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x89ABCDEF, vi4x89ABCDEF), 8);
-    const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
-    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN);
-    const __m128i vxi4xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi4xGHIJKLMN, vi4xGHIJKLMN), 8);
-    const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16));
-    i5 += 24;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
-    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF);
-    const __m128i vxi5x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x89ABCDEF, vi5x89ABCDEF), 8);
-    const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
-    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN);
-    const __m128i vxi5xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi5xGHIJKLMN, vi5xGHIJKLMN), 8);
-    const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16));
-    i6 += 24;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF);
-    const __m128i vxi6x89ABCDEF = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x89ABCDEF, vi6x89ABCDEF), 8);
-    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN);
-    const __m128i vxi6xGHIJKLMN = _mm_srai_epi16(_mm_unpacklo_epi8(vi6xGHIJKLMN, vi6xGHIJKLMN), 8);
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF);
-    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN);
-
-    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
-    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
-    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
-    const __m128i vsgnacc89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc89ABCDEF);
-    __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
-    __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vsgnacc89ABCDEF);
-    const __m128i vsgnaccGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vaccGHIJKLMN);
-    __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
-    __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vsgnaccGHIJKLMN);
-
-    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-    vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias);
-    vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias);
-    vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias);
-    vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias);
-
-    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-    __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB);
-    __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF);
-    __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ);
-    __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN);
-
-    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-    vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
-    vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
-    vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale);
-    vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale);
-
-    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
-    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
-    vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point);
-    vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point);
-    vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point);
-    vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point);
-
-    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
-    vacc4567 = _mm_cvtps_epi32(vfpacc4567);
-    vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
-    vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
-    vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ);
-    vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN);
-
-    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
-    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
-    __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
-
-    vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
-    vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
-    voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
-
-    __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
-    __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
-
-
-    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
-    _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
-    output += 24;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-
-      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
-      i0 += 8;
-
-      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
-      i1 += 8;
-
-      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
-      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
-      i2 += 8;
-
-      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
-      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
-      i3 += 8;
-
-      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
-      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
-      i4 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
-      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
-      i5 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
-      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
-      i6 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-
-      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
-      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
-      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
-
-      vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-      vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-
-      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-
-      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-
-      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
-      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
-
-      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
-      vacc4567 = _mm_cvtps_epi32(vfpacc4567);
-
-      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
-      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
-
-      __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
-
-      if XNN_LIKELY(channels >= 8) {
-        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
-        output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
-          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
-          output += 4;
-        }
-        uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
-        if (channels & 2) {
-          unaligned_store_u16(output, (uint16_t) vout0123);
-          vout0123 >>= 16;
-          output += 2;
-        }
-        if (channels & 1) {
-          *output = (int8_t) vout0123;
-          output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c
deleted file mode 100644
index 85abe5b7577..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse2-c8.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-sse2.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
-  const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
-  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
-  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
-  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
-  for (; channels >= 8; channels -= 8) {
-
-    const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
-    i0 += 8;
-
-    const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
-    const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
-    i1 += 8;
-
-    const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
-    const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
-    i2 += 8;
-
-    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-    const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
-    const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
-    i3 += 8;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-    const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
-    const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
-    i4 += 8;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-    const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
-    const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
-    i5 += 8;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-    const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
-    const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
-    i6 += 8;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-    const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-
-    const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
-    __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
-    __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
-
-    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-
-    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-
-    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-
-    vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
-    vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
-
-    vacc0123 = _mm_cvtps_epi32(vfpacc0123);
-    vacc4567 = _mm_cvtps_epi32(vfpacc4567);
-
-    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
-
-    vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
-
-    __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
-
-
-    _mm_storel_epi64((__m128i*) output, vout0123456701234567);
-    output += 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    {
-
-      const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
-      i0 += 8;
-
-      const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
-      i1 += 8;
-
-      const __m128i vxi0x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi0x01234567, vi0x01234567), 8);
-      const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
-      i2 += 8;
-
-      const __m128i vxi1x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi1x01234567, vi1x01234567), 8);
-      const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
-      i3 += 8;
-
-      __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-      const __m128i vxi2x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi2x01234567, vi2x01234567), 8);
-      const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
-      i4 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-      const __m128i vxi3x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi3x01234567, vi3x01234567), 8);
-      const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
-      i5 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-      const __m128i vxi4x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi4x01234567, vi4x01234567), 8);
-      const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
-      i6 += 8;
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-      const __m128i vxi5x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi5x01234567, vi5x01234567), 8);
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-      const __m128i vxi6x01234567 = _mm_srai_epi16(_mm_unpacklo_epi8(vi6x01234567, vi6x01234567), 8);
-
-      vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-
-      const __m128i vsgnacc01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc01234567);
-      __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vsgnacc01234567);
-      __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vsgnacc01234567);
-
-      vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-      vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-
-      __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-      __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-
-      vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-      vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-
-      vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point);
-      vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point);
-
-      vacc0123 = _mm_cvtps_epi32(vfpacc0123);
-      vacc4567 = _mm_cvtps_epi32(vfpacc4567);
-
-      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
-      vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
-
-      __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
-
-      if (channels
& 4) {
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
-        vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
-        output += 4;
-      }
-      uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
-      if (channels & 2) {
-        unaligned_store_u16(output, (uint16_t) vout0123);
-        vout0123 >>= 16;
-        output += 2;
-      }
-      if (channels & 1) {
-        *output = (int8_t) vout0123;
-      }
-    }
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c
deleted file mode 100644
index 00af3055efa..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c16.c
+++ /dev/null
@@ -1,212 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gavgpool/unipass-sse4.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
-  const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
-  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
-  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
-  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
-  for (; channels >= 16; channels -= 16) {
-    const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
-    const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
-    i0 += 16;
-    const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
-    const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
-    i1 += 16;
-
-    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-    const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
-    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
-    const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
-    i2 += 16;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-    const __m128i vxi3x01234567 =
_mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const 
__m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c deleted file mode 100644 index ee3f7f34de3..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c24.c +++ /dev/null @@ -1,241 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
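All of the fp32 kernels in this family, including the SSE4.1 file deleted below, share one requantization pipeline: sum up to seven int8 rows in int16 lanes (7 * 127 fits comfortably), widen to int32 and add init_bias, scale in float, clamp against the output maximum in the float domain, round back to int32, then add the output zero point while packing and clamp against the minimum. A scalar sketch of one output value follows; the interpretation of init_bias as -rows * input_zero_point and scale as input_scale / (rows * output_scale) is an assumption from how the parameters are used here, and all names are illustrative.

    #include <math.h>
    #include <stdint.h>

    /* Scalar model of the fp32 requantization used by these unipass kernels.
       The meaning of init_bias and scale is an assumption (see lead-in). */
    static int8_t requantize_fp32(int32_t acc16_sum, int32_t init_bias,
                                  float scale, int16_t output_zero_point,
                                  int8_t output_min, int8_t output_max) {
      float fpacc = (float) (acc16_sum + init_bias) * scale;
      /* Clamp above in the float domain, mirroring output_max_less_zero_point. */
      const float fpmax = (float) (output_max - output_zero_point);
      if (fpacc > fpmax) fpacc = fpmax;
      /* _mm_cvtps_epi32 rounds to nearest-even; lrintf matches it in the
         default rounding mode. */
      int32_t out = (int32_t) lrintf(fpacc) + output_zero_point;
      if (out < output_min) out = output_min;
      return (int8_t) out;
    }

Clamping the maximum before the conversion keeps the float-to-int conversion in range, which is why only the minimum clamp remains after the zero point is added.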
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
-  const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
-  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
-  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
-  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
-  for (; channels >= 24; channels -= 24) {
-    const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
-    const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
-    const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
-    i0 += 24;
-    const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
-    const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
-    const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
-    i1 += 24;
-
-    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-    const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
-    __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
-    const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
-    __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
-    const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
-    i2 += 24;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-    const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF);
-    const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
-    vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN);
-    const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
-    i3 += 24;
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-    const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
-    vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF);
-    const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const
__m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - __m128i vacc89AB = _mm_cvtepi16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_srai_epi32(_mm_unpackhi_epi16(vacc89ABCDEF, vacc89ABCDEF), 16); - __m128i vaccGHIJ = _mm_cvtepi16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_srai_epi32(_mm_unpackhi_epi16(vaccGHIJKLMN, vaccGHIJKLMN), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = 
_mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNGHIJKLMN = _mm_max_epi8(voutGHIJKLMNGHIJKLMN, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c deleted file mode 100644 index 3a4be1e814b..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-sse41-c8.c +++ /dev/null @@ -1,178 +0,0 @@ -// 
Auto-generated file. Do not edit!
-// Template: src/qs8-gavgpool/unipass-sse4.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
-  const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
-  const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
-  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
-  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
-  for (; channels >= 8; channels -= 8) {
-    const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
-    i0 += 8;
-    const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
-    i1 += 8;
-
-    __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
-    const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
-    i2 += 8;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567);
-    const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
-    i3 += 8;
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567);
-    const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
-    i4 += 8;
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567);
-    const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
-    i5 += 8;
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567);
-    const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
-    i6 += 8;
-
-    vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567);
-
-    __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567);
-    __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16);
-
-    vacc0123 = _mm_add_epi32(vacc0123, vinit_bias);
-    vacc4567 = _mm_add_epi32(vacc4567, vinit_bias);
-
-    __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123);
-    __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567);
-
-    vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
-    vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
-
-    vfpacc0123 = _mm_min_ps(vfpacc0123,
voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepi16_epi32(vacc01234567); - __m128i vacc4567 = _mm_srai_epi32(_mm_unpackhi_epi16(vacc01234567, vacc01234567), 16); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0); - } - } - } -} diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c deleted file mode 100644 index abf1fac52b9..00000000000 --- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c +++ /dev/null @@ -1,211 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
-  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
-  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
-  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
-  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
-  for (; channels >= 16; channels -= 16) {
-    const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0);
-    const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8);
-    i0 += 16;
-    const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1);
-    const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8);
-    i1 += 16;
-
-    v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
-    const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2);
-    v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
-    const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8);
-    i2 += 16;
-
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
-    const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3);
-    vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF);
-    const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8);
-    i3 += 16;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
-    const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4);
-    vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF);
-    const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8);
-    i4 += 16;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
-    const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5);
-    vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF);
-    const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8);
-    i5 += 16;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
-    const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6);
-    vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF);
-    const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8);
-    i6 += 16;
-
-
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - - v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = 
wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
-
-      const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
-      v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567);
-      vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max);
-
-      if XNN_LIKELY(channels >= 8) {
-        wasm_v128_store64_lane(output, vout0123456701234567, 0);
-        output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          wasm_v128_store32_lane(output, vout0123456701234567, 0);
-          vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
-          output += 4;
-        }
-        if (channels & 2) {
-          wasm_v128_store16_lane(output, vout0123456701234567, 0);
-          vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
-          output += 2;
-        }
-        if (channels & 1) {
-          wasm_v128_store8_lane(output, vout0123456701234567, 0);
-          output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c
deleted file mode 100644
index d7bee50b417..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
-  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
-  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
-  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
-  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
-  for (; channels >= 24; channels -= 24) {
-    const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0);
-    const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8);
-    const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16);
-    i0 += 24;
-    const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1);
-    const v128_t
vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = 
wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - - v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNGHIJKLMN = wasm_i8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNGHIJKLMN = wasm_i8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store64_lane(output + 16, voutGHIJKLMNGHIJKLMN, 0); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - 
wasm_v128_store32_lane(output, vout0123456701234567, 0);
-          vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
-          output += 4;
-        }
-        if (channels & 2) {
-          wasm_v128_store16_lane(output, vout0123456701234567, 0);
-          vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
-          output += 2;
-        }
-        if (channels & 1) {
-          wasm_v128_store8_lane(output, vout0123456701234567, 0);
-          output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c
deleted file mode 100644
index 68e4ef4bf9d..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c
+++ /dev/null
@@ -1,266 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
-  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
-  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
-  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
-  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
-  for (; channels >= 32; channels -= 32) {
-    const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0);
-    const v128_t vxi0x89ABCDEF = wasm_i16x8_load8x8(i0 + 8);
-    const v128_t vxi0xGHIJKLMN = wasm_i16x8_load8x8(i0 + 16);
-    const v128_t vxi0xOPQRSTUV = wasm_i16x8_load8x8(i0 + 24);
-    i0 += 32;
-    const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1);
-    const v128_t vxi1x89ABCDEF = wasm_i16x8_load8x8(i1 + 8);
-    const v128_t vxi1xGHIJKLMN = wasm_i16x8_load8x8(i1 + 16);
-    const v128_t vxi1xOPQRSTUV = wasm_i16x8_load8x8(i1 + 24);
-    i1 += 32;
-
-    v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
-    const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2);
-    v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF);
-    const v128_t vxi2x89ABCDEF =
wasm_i16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_i16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_i16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_i16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_i16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_i16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_i16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_i16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_i16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_i16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_i16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_i16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_i16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_i16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_i16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567)); - v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc89ABCDEF)); - v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc89ABCDEF)); - v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vaccGHIJKLMN)); - v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vaccGHIJKLMN)); - v128_t vaccOPQR = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vaccOPQRSTUV)); - v128_t vaccSTUV = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vaccOPQRSTUV)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = 
wasm_f32x4_convert_i32x4(vaccKLMN); - vaccOPQR = wasm_f32x4_convert_i32x4(vaccOPQR); - vaccSTUV = wasm_f32x4_convert_i32x4(vaccSTUV); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - vaccOPQR = wasm_f32x4_mul(vaccOPQR, vscale); - vaccSTUV = wasm_f32x4_mul(vaccSTUV, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - vaccOPQR = wasm_f32x4_add(vaccOPQR, vmagic_bias); - vaccSTUV = wasm_f32x4_add(vaccSTUV, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - vaccOPQR = wasm_i32x4_max(vaccOPQR, vmagic_min); - vaccSTUV = wasm_i32x4_max(vaccSTUV, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - v128_t voutOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV); - - v128_t vout0123456789ABCDEF = wasm_i8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNOPQRSTUV = wasm_i8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); - - vout0123456789ABCDEF = wasm_i8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = wasm_i8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); - output += 32; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, 
-
-      v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567));
-      v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567));
-
-      vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
-      vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
-
-      vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
-      vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
-
-      vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
-      vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
-
-      vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
-      vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
-
-      vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
-      vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
-
-      const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
-      v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567);
-      vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max);
-
-      if XNN_LIKELY(channels >= 8) {
-        wasm_v128_store64_lane(output, vout0123456701234567, 0);
-        output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          wasm_v128_store32_lane(output, vout0123456701234567, 0);
-          vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
-          output += 4;
-        }
-        if (channels & 2) {
-          wasm_v128_store16_lane(output, vout0123456701234567, 0);
-          vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
-          output += 2;
-        }
-        if (channels & 1) {
-          wasm_v128_store8_lane(output, vout0123456701234567, 0);
-          output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c
deleted file mode 100644
index 2ee517e72ef..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-wasmsimd.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
-  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
-  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
-  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
-  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
-  for (; channels >= 8; channels -= 8) {
-    const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0);
-    i0 += 8;
-    const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1);
-    i1 += 8;
-
-    v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
-    const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2);
-    i2 += 8;
-
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
-    const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3);
-    i3 += 8;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
-    const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4);
-    i4 += 8;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
-    const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5);
-    i5 += 8;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
-    const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6);
-    i6 += 8;
-
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
-
-    v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567));
-    v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567));
-
-    vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
-    vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
-
-    vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
-    vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
-
-    vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
-    vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
-
-    vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
-    vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
-
-    vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
-    vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
-
-    v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
-
-    v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567);
-
-    vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max);
-
-    wasm_v128_store64_lane(output, vout0123456701234567, 0);
-    output += 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    {
-      const v128_t vxi0x01234567 = wasm_i16x8_load8x8(i0);
-      i0 += 8;
-      const v128_t vxi1x01234567 = wasm_i16x8_load8x8(i1);
-      i1 += 8;
-
-      v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
-      const v128_t vxi2x01234567 = wasm_i16x8_load8x8(i2);
-      i2 += 8;
-
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
-      const v128_t vxi3x01234567 = wasm_i16x8_load8x8(i3);
-      i3 += 8;
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
-      const v128_t vxi4x01234567 = wasm_i16x8_load8x8(i4);
-      i4 += 8;
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
-      const v128_t vxi5x01234567 = wasm_i16x8_load8x8(i5);
-      i5 += 8;
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
-      const v128_t vxi6x01234567 = wasm_i16x8_load8x8(i6);
-      i6 += 8;
-
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
-
-      v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_low_i16x8(vacc01234567));
-      v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_i32x4_extend_high_i16x8(vacc01234567));
-
-      vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
-      vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
-
-      vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
-      vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
-
-      vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
-      vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
-
-      vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
-      vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
-
-      vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
-      vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
-
-      const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
-      v128_t vout0123456701234567 = wasm_i8x16_narrow_i16x8(vout01234567, vout01234567);
-      vout0123456701234567 = wasm_i8x16_min(vout0123456701234567, voutput_max);
-
-      if (channels & 4) {
-        wasm_v128_store32_lane(output, vout0123456701234567, 0);
-        vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
-        output += 4;
-      }
-      if (channels & 2) {
-        wasm_v128_store16_lane(output, vout0123456701234567, 0);
-        vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
-        output += 2;
-      }
-      if (channels & 1) {
-        wasm_v128_store8_lane(output, vout0123456701234567, 0);
-      }
-    }
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c
deleted file mode 100644
index 65df918caea..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c16.c
+++ /dev/null
@@ -1,195 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-neon.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
-  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
-  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
-  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
-  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
-  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
-  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
-  for (; channels >= 16; channels -= 16) {
-    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
-
-    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-    const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
-
-    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-    const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
-    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-    const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
-    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-    const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
-    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-    const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
-    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
-
-    int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-    int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-    int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
-    int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
-
-    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-    vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift);
-    vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift);
-
-    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-    vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
-    vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
-
-    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-    vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift);
-    vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);
-
-    #if XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
-    #else  // !XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
-    #endif  // !XNN_ARCH_ARM64
-
-    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
-
-    #if XNN_ARCH_ARM64
-      int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
-    #else  // !XNN_ARCH_ARM64
-      int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
-    #endif  // !XNN_ARCH_ARM64
-
-    vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
-
-    vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
-
-    vst1q_s8(output, vout0123456789ABCDEF); output += 16;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-      int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-      int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-
-      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-
-      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-
-      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-
-      #if XNN_ARCH_ARM64
-        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      #else
-        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      #endif
-      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-
-      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
-      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
-      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
-
-      if XNN_LIKELY(channels >= 8) {
-        vst1_s8(output, vout01234567); output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
-          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
-        }
-        if (channels & 2) {
-          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
-          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
-        }
-        if (channels & 1) {
-          vst1_lane_s8(output, vout01234567, 0); output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c
deleted file mode 100644
index 3cbe96409aa..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c24.c
+++ /dev/null
@@ -1,224 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-neon.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
-  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
-  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
-  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
-  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
-  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
-  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
-  for (; channels >= 24; channels -= 24) {
-    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
-
-    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-    const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
-    const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
-    int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
-
-    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-    const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
-    const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
-    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-    const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
-    const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
-    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-    const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
-    const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
-    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-    const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
-    const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
-    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
-
-    int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-    int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-    int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
-    int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
-    int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
-    int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
-
-    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-    vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift);
-    vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift);
-    vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift);
-    vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift);
-
-    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-    vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
-    vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
-    vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier);
-    vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier);
-
-    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-    vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift);
-    vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);
-    vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift);
-    vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift);
-
-    #if XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
-      int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
-    #else  // !XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
-      int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
-    #endif  // !XNN_ARCH_ARM64
-
-    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
-    vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
-
-    #if XNN_ARCH_ARM64
-      int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
-      int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN);
-    #else  // !XNN_ARCH_ARM64
-      int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
-      int8x8_t voutGHIJKLMN = vqmovn_s16(vaccGHIJKLMN);
-    #endif  // !XNN_ARCH_ARM64
-
-    vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
-    voutGHIJKLMN = vmax_s8(voutGHIJKLMN, vget_low_s8(voutput_min));
-
-    vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
-    voutGHIJKLMN = vmin_s8(voutGHIJKLMN, vget_low_s8(voutput_max));
-
-    vst1q_s8(output, vout0123456789ABCDEF); output += 16;
-    vst1_s8(output, voutGHIJKLMN); output += 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-      int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-      int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-
-      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-
-      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-
-      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-
-      #if XNN_ARCH_ARM64
-        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      #else
-        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      #endif
-      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-
-      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
-      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
-      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
-
-      if XNN_LIKELY(channels >= 8) {
-        vst1_s8(output, vout01234567); output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
-          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
-        }
-        if (channels & 2) {
-          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
-          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
-        }
-        if (channels & 1) {
-          vst1_lane_s8(output, vout01234567, 0); output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c
deleted file mode 100644
index f0301a25336..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c32.c
+++ /dev/null
@@ -1,248 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-neon.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
-  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
-  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
-  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
-  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
-  const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
-  const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
-  for (; channels >= 32; channels -= 32) {
-    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0xGHIJKLMN = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi0xOPQRSTUV = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1x89ABCDEF = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1xGHIJKLMN = vld1_s8(i1); i1 += 8;
-    const int8x8_t vi1xOPQRSTUV = vld1_s8(i1); i1 += 8;
-
-    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-    const int8x8_t vi2x89ABCDEF = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum89ABCDEF = vaddl_s8(vi0x89ABCDEF, vi1x89ABCDEF);
-    const int8x8_t vi2xGHIJKLMN = vld1_s8(i2); i2 += 8;
-    int16x8_t vsumGHIJKLMN = vaddl_s8(vi0xGHIJKLMN, vi1xGHIJKLMN);
-    const int8x8_t vi2xOPQRSTUV = vld1_s8(i2); i2 += 8;
-    int16x8_t vsumOPQRSTUV = vaddl_s8(vi0xOPQRSTUV, vi1xOPQRSTUV);
-
-    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-    const int8x8_t vi3x89ABCDEF = vld1_s8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi2x89ABCDEF);
-    const int8x8_t vi3xGHIJKLMN = vld1_s8(i3); i3 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi2xGHIJKLMN);
-    const int8x8_t vi3xOPQRSTUV = vld1_s8(i3); i3 += 8;
-    vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi2xOPQRSTUV);
-    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-    const int8x8_t vi4x89ABCDEF = vld1_s8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi3x89ABCDEF);
-    const int8x8_t vi4xGHIJKLMN = vld1_s8(i4); i4 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi3xGHIJKLMN);
-    const int8x8_t vi4xOPQRSTUV = vld1_s8(i4); i4 += 8;
-    vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi3xOPQRSTUV);
-    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-    const int8x8_t vi5x89ABCDEF = vld1_s8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi4x89ABCDEF);
-    const int8x8_t vi5xGHIJKLMN = vld1_s8(i5); i5 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi4xGHIJKLMN);
-    const int8x8_t vi5xOPQRSTUV = vld1_s8(i5); i5 += 8;
-    vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi4xOPQRSTUV);
-    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-    const int8x8_t vi6x89ABCDEF = vld1_s8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi5x89ABCDEF);
-    const int8x8_t vi6xGHIJKLMN = vld1_s8(i6); i6 += 8;
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi5xGHIJKLMN);
-    const int8x8_t vi6xOPQRSTUV = vld1_s8(i6); i6 += 8;
-    vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi5xOPQRSTUV);
-    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-    vsum89ABCDEF = vaddw_s8(vsum89ABCDEF, vi6x89ABCDEF);
-    vsumGHIJKLMN = vaddw_s8(vsumGHIJKLMN, vi6xGHIJKLMN);
-    vsumOPQRSTUV = vaddw_s8(vsumOPQRSTUV, vi6xOPQRSTUV);
-
-    int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-    int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-    int32x4_t vacc89AB = vaddw_s16(vinit_bias, vget_low_s16(vsum89ABCDEF));
-    int32x4_t vaccCDEF = vaddw_s16(vinit_bias, vget_high_s16(vsum89ABCDEF));
-    int32x4_t vaccGHIJ = vaddw_s16(vinit_bias, vget_low_s16(vsumGHIJKLMN));
-    int32x4_t vaccKLMN = vaddw_s16(vinit_bias, vget_high_s16(vsumGHIJKLMN));
-    int32x4_t vaccOPQR = vaddw_s16(vinit_bias, vget_low_s16(vsumOPQRSTUV));
-    int32x4_t vaccSTUV = vaddw_s16(vinit_bias, vget_high_s16(vsumOPQRSTUV));
-
-    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-    vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift);
-    vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift);
-    vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift);
-    vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift);
-    vaccOPQR = vqshlq_s32(vaccOPQR, vleft_pre_shift);
-    vaccSTUV = vqshlq_s32(vaccSTUV, vleft_pre_shift);
-
-    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-    vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier);
-    vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier);
-    vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier);
-    vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier);
-    vaccOPQR = vqdmulhq_s32(vaccOPQR, vmultiplier);
-    vaccSTUV = vqdmulhq_s32(vaccSTUV, vmultiplier);
-
-    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-    vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift);
-    vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift);
-    vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift);
-    vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift);
-    vaccOPQR = vrshlq_s32(vaccOPQR, vleft_post_shift);
-    vaccSTUV = vrshlq_s32(vaccSTUV, vleft_post_shift);
-
-    #if XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF);
-      int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN);
-      int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV);
-    #else  // !XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF));
-      int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN));
-      int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV));
-    #endif  // !XNN_ARCH_ARM64
-
-    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-    vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point);
-    vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point);
-    vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point);
-
-    #if XNN_ARCH_ARM64
-      int8x16_t vout0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc01234567), vacc89ABCDEF);
-      int8x16_t voutGHIJKLMNOPQRSTUV = vqmovn_high_s16(vqmovn_s16(vaccGHIJKLMN), vaccOPQRSTUV);
-    #else  // !XNN_ARCH_ARM64
-      int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
-      int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV));
-    #endif  // !XNN_ARCH_ARM64
-
-    vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
-    voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min);
-
-    vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
-    voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max);
-
-    vst1q_s8(output, vout0123456789ABCDEF); output += 16;
-    vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    do {
-      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-      int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-      int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-
-      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-
-      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-
-      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-
-      #if XNN_ARCH_ARM64
-        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      #else
-        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      #endif
-      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-
-      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
-      vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
-      vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
-
-      if XNN_LIKELY(channels >= 8) {
-        vst1_s8(output, vout01234567); output += 8;
-        channels -= 8;
-      } else {
-        if (channels & 4) {
-          vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
-          vout01234567 = vext_s8(vout01234567, vout01234567, 4);
-        }
-        if (channels & 2) {
-          vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
-          vout01234567 = vext_s8(vout01234567, vout01234567, 2);
-        }
-        if (channels & 1) {
-          vst1_lane_s8(output, vout01234567, 0); output += 1;
-        }
-        channels = 0;
-      }
-    } while (channels != 0);
-  }
-}
diff --git a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c b/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c
deleted file mode 100644
index 24ef7d774a8..00000000000
--- a/src/qs8-gavgpool/gen/qs8-gavgpool-7x-minmax-rndnu-neon-c8.c
+++ /dev/null
@@ -1,165 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/qs8-gavgpool/unipass-neon.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8(
-    size_t rows,
-    size_t channels,
-    const int8_t* input,
-    size_t input_stride,
-    const int8_t* zero,
-    int8_t* output,
-    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const int8_t* i0 = input;
-  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
-  const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
-  const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
-  const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
-  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
-  const int8x8_t voutput_min = vld1_dup_s8(&params->rndnu_neon.output_min);
-  const int8x8_t voutput_max = vld1_dup_s8(&params->rndnu_neon.output_max);
-  for (; channels >= 8; channels -= 8) {
-    const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-    const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-
-    const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-    int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-    const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-    const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-    const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-    const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-    vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-    vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-    int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-    int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-
-    vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-    vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-
-    vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-    vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-
-    vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-    vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-
-    #if XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-    #else  // !XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-    #endif  // !XNN_ARCH_ARM64
-
-    vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-
-    #if XNN_ARCH_ARM64
-      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
-    #else  // !XNN_ARCH_ARM64
-      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
-    #endif  // !XNN_ARCH_ARM64
-
-    vout01234567 = vmax_s8(vout01234567, voutput_min);
-
-    vout01234567 = vmin_s8(vout01234567, voutput_max);
-
-    vst1_s8(output, vout01234567); output += 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    {
-      const int8x8_t vi0x01234567 = vld1_s8(i0); i0 += 8;
-      const int8x8_t vi1x01234567 = vld1_s8(i1); i1 += 8;
-      const int8x8_t vi2x01234567 = vld1_s8(i2); i2 += 8;
-      int16x8_t vsum01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
-
-      const int8x8_t vi3x01234567 = vld1_s8(i3); i3 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi2x01234567);
-      const int8x8_t vi4x01234567 = vld1_s8(i4); i4 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi3x01234567);
-      const int8x8_t vi5x01234567 = vld1_s8(i5); i5 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi4x01234567);
-      const int8x8_t vi6x01234567 = vld1_s8(i6); i6 += 8;
-      vsum01234567 = vaddw_s8(vsum01234567, vi5x01234567);
-      vsum01234567 = vaddw_s8(vsum01234567, vi6x01234567);
-
-      int32x4_t vacc0123 = vaddw_s16(vinit_bias, vget_low_s16(vsum01234567));
-      int32x4_t vacc4567 = vaddw_s16(vinit_bias, vget_high_s16(vsum01234567));
-
-      vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift);
-      vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift);
-
-      vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier);
-      vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier);
-
-      vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift);
-      vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift);
-
-      #if XNN_ARCH_ARM64
-        int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-      #else
-        int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-      #endif
-      vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point);
-
-      int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
-      vout01234567 = vmax_s8(vout01234567, voutput_min);
-      vout01234567 = vmin_s8(vout01234567, voutput_max);
-
-      if (channels & 4) {
-        vst1_lane_u32((void*) output, vreinterpret_u32_s8(vout01234567), 0); output += 4;
-        vout01234567 = vext_s8(vout01234567, vout01234567, 4);
-      }
-      if (channels & 2) {
-        vst1_lane_u16((void*) output, vreinterpret_u16_s8(vout01234567), 0); output += 2;
-        vout01234567 = vext_s8(vout01234567, vout01234567, 2);
-      }
-      if (channels & 1) {
-        vst1_lane_s8(output, vout01234567, 0);
-      }
-    }
-  }
-}
diff --git a/src/qs8-gavgpool/multipass-neon.c.in b/src/qs8-gavgpool/multipass-neon.c.in
deleted file mode 100644
index 3e83a2d5619..00000000000
--- a/src/qs8-gavgpool/multipass-neon.c.in
+++ /dev/null
@@ -1,423 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert DATATYPE in ["QS8", "QU8"]
-$assert CHANNEL_TILE % 8 == 0
-$assert CHANNEL_TILE >= 8
-$assert ROW_TILE >= 3
-$assert ROW_SUBTILE >= 3
-$assert ROW_SUBTILE <= ROW_TILE
-$assert REQUANTIZATION in ["FP32", "RNDNU"]
-$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-$if ARMV8:
-  #include "xnnpack/intrinsics-polyfill.h"
-#include "xnnpack/math.h"
-
-
-$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon")
-$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
-$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
-$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
-$XINT16X8_T = {"QS8": "int16x8_t", "QU8": "uint16x8_t"}[DATATYPE]
-$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
-$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
-$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
-$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE]
-$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE]
-$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
-$VADDL_X8 = {"QS8": "vaddl_s8", "QU8": "vaddl_u8"}[DATATYPE]
-$VADDW_X8 = {"QS8": "vaddw_s8", "QU8": "vaddw_u8"}[DATATYPE]
-$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE]
-$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE]
-$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE]
-$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE]
-$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE]
-$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE]
-$VQMOVXN_HIGH_S16 = {"QS8": "vqmovn_high_s16", "QU8": "vqmovun_high_s16"}[DATATYPE]
-$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE]
-$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
-$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
-$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
-$ISA = "neonv8" if ARMV8 else "neon"
-void xnn_${DATATYPE.lower()}_gavgpool_minmax_${REQUANTIZATION.lower()}_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__${ISA}_c${CHANNEL_TILE}(
-    size_t rows,
-    size_t channels,
-    const ${XINT8_T}* input,
-    size_t input_stride,
-    const ${XINT8_T}* zero,
-    int32_t* buffer,
-    ${XINT8_T}* output,
-    const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > ${ROW_TILE});
-  assert(channels != 0);
-
-  const ${XINT8_T}* i0 = input;
-  $for M in range(1, ROW_TILE):
-    const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
-  $if CHANNEL_TILE <= 16:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
-  $else:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->${PARAMS_STRUCT}.init_bias);
-  int32_t* b = buffer;
-  size_t c = channels;
-  for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
-    $for M in range(2):
-      $for C in range(0, CHANNEL_TILE, 8):
-        const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
-      ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
-
-    $for M in range(2, ROW_TILE):
-      $for C in range(0, CHANNEL_TILE, 8):
-        $if M + 1 != ROW_TILE:
-          const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
-        vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      $if DATATYPE == "QS8":
-        const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]}));
-        const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]}));
-      $else:
-        const int32x4_t vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[C:C+8]})));
-        const int32x4_t vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[C:C+8]})));
-
-    $for C in range(0, CHANNEL_TILE, 4):
-      vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
-  }
-  $if CHANNEL_TILE > 16:
-    if XNN_UNLIKELY(c != 0) {
-      do {
-        $for M in range(3):
-          const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8;
-        ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
-
-        $for M in range(2, ROW_TILE):
-          $if M + 1 != ROW_TILE:
-            const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
-          vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
-
-        $if DATATYPE == "QS8":
-          const int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[0:8]}));
-          const int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[0:8]}));
-        $else:
-          const int32x4_t vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[0:8]})));
-          const int32x4_t vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[0:8]})));
-
-        vst1q_s32(b, vacc${ABC[0:4]}); b += 4;
-        vst1q_s32(b, vacc${ABC[4:8]}); b += 4;
-
-        c = doz(c, 8);
-      } while (c != 0);
-    }
-
-  for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
-    $for M in range(ROW_SUBTILE):
-      i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
-
-    int32_t* b = buffer;
-    size_t c = channels;
-    for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
-      $for M in range(2):
-        $for C in range(0, CHANNEL_TILE, 8):
-          const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
-
-      $for C in range(0, CHANNEL_TILE, 8):
-        const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
-        ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
-
-      $for M in range(2, ROW_TILE):
-        $for C in range(0, CHANNEL_TILE, 8):
-          $if M + 1 != ROW_TILE:
-            const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
-          $else:
-            $if C == 0:
-              int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(b);
-            $else:
-              int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(b + ${C});
-            int32x4_t vacc${ABC[C+4:C+8]} = vld1q_s32(b + ${C+4});
-          vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
-
-      $for C in range(0, CHANNEL_TILE, 8):
-        $if DATATYPE == "QS8":
-          vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vsum${ABC[C:C+8]}));
-          vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vsum${ABC[C:C+8]}));
-        $else:
-          vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C:C+4]}), vget_low_u16(vsum${ABC[C:C+8]})));
-          vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C+4:C+8]}), vget_high_u16(vsum${ABC[C:C+8]})));
-
-      $for C in range(0, CHANNEL_TILE, 4):
-        vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
-    }
-    $if CHANNEL_TILE > 16:
-      if XNN_UNLIKELY(c != 0) {
-        do {
-          $for M in range(3):
-            const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8;
-          ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
-
-          $for M in range(2, ROW_TILE):
-            $if M + 1 != ROW_TILE:
-              const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
-            $else:
-              int32x4_t vacc${ABC[0:4]} = vld1q_s32(b);
-              int32x4_t vacc${ABC[4:8]} = vld1q_s32(b + 4);
-            vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]});
-
-          $if DATATYPE == "QS8":
-            vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vsum${ABC[0:8]}));
-            vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vsum${ABC[0:8]}));
-          $else:
-            vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[0:4]}), vget_low_u16(vsum${ABC[0:8]})));
-            vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[4:8]}), vget_high_u16(vsum${ABC[0:8]})));
-
-          vst1q_s32(b, vacc${ABC[0:4]}); b += 4;
-          vst1q_s32(b, vacc${ABC[4:8]}); b += 4;
-
-          c = doz(c, 8);
-        } while (c != 0);
-      }
-  }
-
-  i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
-  $for M in range(1, ROW_SUBTILE):
-    i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
-    $if M % 2 == 1:
-      if XNN_UNPREDICTABLE(rows < ${M+1}) {
-        i${M} = zero;
-      }
-    $else:
-      if XNN_UNPREDICTABLE(rows <= ${M}) {
-        i${M} = zero;
-      }
-
-  $if REQUANTIZATION == "FP32":
-    const float32x4_t vscale = vld1q_dup_f32(&params->${PARAMS_STRUCT}.scale);
-    $if ARMV8:
-      const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
-    $else:
-      const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
-      const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
-  $elif REQUANTIZATION == "RNDNU":
-    const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift);
-    const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
-    const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift);
-    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
-  $if CHANNEL_TILE > 8:
-    const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
-    const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
-  $else:
-    const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
-    const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
-  for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
-    $for M in range(2):
-      $for C in range(0, CHANNEL_TILE, 8):
-        const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8;
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8;
-      ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
-
-    $for M in range(2, ROW_TILE):
-      $for C in range(0, CHANNEL_TILE, 8):
-        $if M + 1 != ROW_TILE:
-          const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8;
-        $else:
-          int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(buffer); buffer += 4;
-          int32x4_t vacc${ABC[C+4:C+8]} = vld1q_s32(buffer); buffer += 4;
-        vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
-
-    $for C in range(0, CHANNEL_TILE, 8):
-      $if DATATYPE == "QS8":
-        vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vsum${ABC[C:C+8]}));
-        vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vsum${ABC[C:C+8]}));
-      $else:
-        vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C:C+4]}), vget_low_u16(vsum${ABC[C:C+8]})));
-        vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[C+4:C+8]}), vget_high_u16(vsum${ABC[C:C+8]})));
-
-    $if REQUANTIZATION == "FP32":
-      $for C in range(0, CHANNEL_TILE, 4):
-        float32x4_t vfpacc${ABC[C:C+4]} = vcvtq_f32_s32(vacc${ABC[C:C+4]});
-
-      $for C in range(0, CHANNEL_TILE, 4):
-        vfpacc${ABC[C:C+4]} = vmulq_f32(vfpacc${ABC[C:C+4]}, vscale);
-
-      $if ARMV8:
-        $for C in range(0, CHANNEL_TILE, 4):
-          vacc${ABC[C:C+4]} = vcvtnq_s32_f32(vfpacc${ABC[C:C+4]});
-      $else:
-        $for C in range(0, CHANNEL_TILE, 4):
-          vacc${ABC[C:C+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[C:C+4]}, vmagic_bias));
-
-        $for C in range(0, CHANNEL_TILE, 4):
-          vacc${ABC[C:C+4]} = vqsubq_s32(vacc${ABC[C:C+4]}, vmagic_bias_less_output_zero_point);
-    $elif REQUANTIZATION == "RNDNU":
-      $for C in range(0, CHANNEL_TILE, 4):
-        vacc${ABC[C:C+4]} = vqshlq_s32(vacc${ABC[C:C+4]}, vleft_pre_shift);
-
-      $for C in range(0, CHANNEL_TILE, 4):
-        vacc${ABC[C:C+4]} = vqdmulhq_s32(vacc${ABC[C:C+4]}, vmultiplier);
-
-      $for C in range(0, CHANNEL_TILE, 4):
-        vacc${ABC[C:C+4]} = vrshlq_s32(vacc${ABC[C:C+4]}, vleft_post_shift);
-
-    #if XNN_ARCH_ARM64
-      $for C in range(0, CHANNEL_TILE, 8):
-        int16x8_t vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]});
-    #else  // !XNN_ARCH_ARM64
-      $for C in range(0, CHANNEL_TILE, 8):
-        int16x8_t vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]}));
-    #endif  // !XNN_ARCH_ARM64
-
-    $if REQUANTIZATION != "FP32" or ARMV8:
-      $for C in range(0, CHANNEL_TILE, 8):
-        vacc${ABC[C:C+8]} = vqaddq_s16(vacc${ABC[C:C+8]}, voutput_zero_point);
-
-    #if XNN_ARCH_ARM64
-      $for C in range(0, CHANNEL_TILE, 16):
-        $if C + 8 < CHANNEL_TILE:
-          ${XINT8X16_T} vout${ABC[C:C+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]});
-        $else:
-          ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]});
-    #else  // !XNN_ARCH_ARM64
-      $for C in range(0, CHANNEL_TILE, 16):
-        $if C + 8 < CHANNEL_TILE:
-          ${XINT8X16_T} vout${ABC[C:C+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), ${VQMOVXN_S16}(vacc${ABC[C+8:C+16]}));
-        $else:
-          ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]});
-    #endif  // !XNN_ARCH_ARM64
-
-    $for C in range(0, CHANNEL_TILE, 16):
-      $if C + 8 < CHANNEL_TILE:
-        vout${ABC[C:C+16]} = ${VMAXQ_X8}(vout${ABC[C:C+16]}, voutput_min);
-      $elif CHANNEL_TILE > 8:
-        vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_min));
-      $else:
-        vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, voutput_min);
-
-    $for C in range(0, CHANNEL_TILE, 16):
-      $if C + 8 < CHANNEL_TILE:
-        vout${ABC[C:C+16]} = ${VMINQ_X8}(vout${ABC[C:C+16]}, voutput_max);
-      $elif CHANNEL_TILE > 8:
-        vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_max));
-      $else:
-        vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, voutput_max);
-
-    $for C in range(0, CHANNEL_TILE, 16):
-      $if C + 8 < CHANNEL_TILE:
-        ${VST1Q_X8}(output, vout${ABC[C:C+16]}); output += 16;
16; - $else: - ${VST1_X8}(output, vout${ABC[C:C+8]}); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(3): - $if CHANNEL_TILE > 8: - const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8; - $else: - const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); - ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}); - - $for M in range(2, ROW_TILE): - $if M + 1 != ROW_TILE: - $if CHANNEL_TILE > 8: - const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8; - $else: - const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); - $else: - int32x4_t vacc${ABC[0:4]} = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc${ABC[4:8]} = vld1q_s32(buffer); buffer += 4; - vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]}); - - $if DATATYPE == "QS8": - vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vsum${ABC[0:8]})); - vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vsum${ABC[0:8]})); - $else: - vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[0:4]}), vget_low_u16(vsum${ABC[0:8]}))); - vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc${ABC[4:8]}), vget_high_u16(vsum${ABC[0:8]}))); - - $if REQUANTIZATION == "FP32": - float32x4_t vfpacc${ABC[0:4]} = vcvtq_f32_s32(vacc${ABC[0:4]}); - float32x4_t vfpacc${ABC[4:8]} = vcvtq_f32_s32(vacc${ABC[4:8]}); - - vfpacc${ABC[0:4]} = vmulq_f32(vfpacc${ABC[0:4]}, vscale); - vfpacc${ABC[4:8]} = vmulq_f32(vfpacc${ABC[4:8]}, vscale); - - $if ARMV8: - vacc${ABC[0:4]} = vcvtnq_s32_f32(vfpacc${ABC[0:4]}); - vacc${ABC[4:8]} = vcvtnq_s32_f32(vfpacc${ABC[4:8]}); - $else: - vacc${ABC[0:4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[0:4]}, vmagic_bias)); - vacc${ABC[4:8]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[4:8]}, vmagic_bias)); - - vacc${ABC[0:4]} = vqsubq_s32(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point); - vacc${ABC[4:8]} = vqsubq_s32(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point); - $elif REQUANTIZATION == "RNDNU": - vacc${ABC[0:4]} = vqshlq_s32(vacc${ABC[0:4]}, vleft_pre_shift); - vacc${ABC[4:8]} = vqshlq_s32(vacc${ABC[4:8]}, vleft_pre_shift); - - vacc${ABC[0:4]} = vqdmulhq_s32(vacc${ABC[0:4]}, vmultiplier); - vacc${ABC[4:8]} = vqdmulhq_s32(vacc${ABC[4:8]}, vmultiplier); - - vacc${ABC[0:4]} = vrshlq_s32(vacc${ABC[0:4]}, vleft_post_shift); - vacc${ABC[4:8]} = vrshlq_s32(vacc${ABC[4:8]}, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]}); - #else - int16x8_t vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]})); - #endif - $if REQUANTIZATION != "FP32" or ARMV8: - vacc${ABC[0:8]} = vqaddq_s16(vacc${ABC[0:8]}, voutput_zero_point); - - ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]}); - $if CHANNEL_TILE > 8: - vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min)); - vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - ${VST1_X8}(output, vout${ABC[0:8]}); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, 
vout${ABC[0:8]}, 2); - } - if (channels & 1) { - ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); output += 1; - } - channels = 0; - } - $else: - vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min); - vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2); - } - if (channels & 1) { - ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/multipass-scalar.c.in b/src/qs8-gavgpool/multipass-scalar.c.in deleted file mode 100644 index 366711f5fef..00000000000 --- a/src/qs8-gavgpool/multipass-scalar.c.in +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE >= 1 -$assert CHANNEL_TILE <= 16 -$assert ROW_TILE >= 3 -$assert ROW_SUBTILE >= 3 -$assert ROW_SUBTILE <= ROW_TILE -$assert REQUANTIZATION == "FP32" -#include <assert.h> -$if VARIANT == "LRINTF": - #include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower() -$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" -$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" -$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - int32_t* buffer, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); - - const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias; - int32_t* b = buffer; - $if CHANNEL_TILE == 1: - size_t c = channels; - do { - int32_t vacc = vinit_bias; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(2, ROW_TILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(ROW_TILE - 2, ROW_TILE): - vacc += vi${M}; - - *b++ = vacc; - } while (--c != 0); - $else: - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= ${CHANNEL_TILE}) { - $for C in range(CHANNEL_TILE): - const int32_t vi0x${C} = (int32_t) i0[${C}]; - i0 += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - int32_t vacc${C} = vi0x${C} + vinit_bias; - const int32_t vi1x${C} = (int32_t) i1[${C}]; - i1 += ${CHANNEL_TILE}; - - $for M in range(2, ROW_TILE): - $for C in range(CHANNEL_TILE): - vacc${C} += vi${M-1}x${C}; - const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - vacc${C} += vi${ROW_TILE-1}x${C}; - - $for C in range(CHANNEL_TILE): - b[${C}] = vacc${C}; - b += ${CHANNEL_TILE}; - 
} - - for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { - $for M in range(ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - - int32_t* b = buffer; - $if CHANNEL_TILE == 1: - size_t c = channels; - do { - int32_t vacc = *b; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(2, ROW_SUBTILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): - vacc += vi${M}; - - *b++ = vacc; - } while (--c != 0); - $else: - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= ${CHANNEL_TILE}) { - $for C in range(CHANNEL_TILE): - int32_t vacc${C} = b[${C}]; - const int32_t vi0x${C} = (int32_t) i0[${C}]; - i0 += ${CHANNEL_TILE}; - - $for M in range(1, ROW_SUBTILE): - $for C in range(CHANNEL_TILE): - vacc${C} += vi${M-1}x${C}; - const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - vacc${C} += vi${ROW_SUBTILE-1}x${C}; - - $for C in range(CHANNEL_TILE): - b[${C}] = vacc${C}; - b += ${CHANNEL_TILE}; - } - } - - i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); - $for M in range(1, ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const float vscale = params->${PARAMS_STRUCT}.scale; - $if VARIANT == "FMAGIC": - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - $if CHANNEL_TILE == 1: - do { - int32_t vacc = *buffer++; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(2, ROW_SUBTILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): - vacc += vi${M}; - - float vfpacc = (float) vacc * vscale; - $if VARIANT == "FMAGIC": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = 
${MIN_F32}(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (${XINT8_T}) vout; - } while (--channels != 0); - $else: - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for C in range(CHANNEL_TILE): - int32_t vacc${C} = buffer[${C}]; - const int32_t vi0x${C} = (int32_t) i0[${C}]; - buffer += ${CHANNEL_TILE}; - i0 += ${CHANNEL_TILE}; - - $for M in range(1, ROW_SUBTILE): - $for C in range(CHANNEL_TILE): - vacc${C} += vi${M-1}x${C}; - const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - vacc${C} += vi${ROW_SUBTILE-1}x${C}; - - $for C in range(CHANNEL_TILE): - float vfpacc${C} = (float) vacc${C} * vscale; - - $if VARIANT == "FMAGIC": - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); - - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); - - $for C in range(CHANNEL_TILE): - vfpacc${C} += vmagic_bias; - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - $for C in range(CHANNEL_TILE): - vfpacc${C} += vmagic_bias; - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}); - - $for C in range(CHANNEL_TILE): - vout${C} = math_max_s32(vout${C}, vmagic_min); - - $for C in range(CHANNEL_TILE): - vout${C} = math_min_s32(vout${C}, vmagic_max); - - $for C in range(CHANNEL_TILE): - vout${C} -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); - - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); - - $for C in range(CHANNEL_TILE): - const int32_t vrndacc${C} = (int32_t) lrintf(vfpacc${C}); - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = vrndacc${C} + voutput_zero_point; - - $for C in range(CHANNEL_TILE): - output[${C}] = (${XINT8_T}) vout${C}; - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - $if CHANNEL_TILE == 2: - int32_t vacc = *buffer; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}; - - $for M in range(2, ROW_SUBTILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}; - - $for M in range(ROW_SUBTILE - 2, ROW_SUBTILE): - vacc += vi${M}; - - float vfpacc = (float) vacc * vscale; - $if VARIANT == "FMAGIC": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output = (${XINT8_T}) vout; - $else: - do { - int32_t vacc = *buffer++; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(2, ROW_SUBTILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in 
range(ROW_SUBTILE - 2, ROW_SUBTILE): - vacc += vi${M}; - - float vfpacc = (float) vacc * vscale; - $if VARIANT == "FMAGIC": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (${XINT8_T}) vout; - } while (--channels != 0); - } -} diff --git a/src/qs8-gavgpool/multipass-sse2.c.in b/src/qs8-gavgpool/multipass-sse2.c.in deleted file mode 100644 index 0074c7f92fb..00000000000 --- a/src/qs8-gavgpool/multipass-sse2.c.in +++ /dev/null @@ -1,392 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert ROW_SUBTILE >= 3 -$assert ROW_SUBTILE <= ROW_TILE -$assert REQUANTIZATION == "FP32" -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" -$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse2_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - int32_t* buffer, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if CHANNEL_TILE <= 16: - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); - $else: - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T}); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - $if DATATYPE == "QU8": - const __m128i vzero = _mm_setzero_si128(); - int32_t* b = buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(ROW_TILE + 2): - - $for C in range(0, CHANNEL_TILE, 8): - $if M == 3: - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $elif M > 3: - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $if 1 <= M <= ROW_TILE: - $if DATATYPE == "QS8": - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8); - $else: - const __m128i vxi${M-1}x${ABC[C:C+8]} = 
_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero); - $if M < ROW_TILE: - $if C == 0: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - $else: - const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); - $if M < ROW_TILE: - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]}); - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - $else: - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]}); - b += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(ROW_TILE + 3): - - $if M == 4: - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $elif M > 4: - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $if 2 <= M <= ROW_TILE + 1: - $if DATATYPE == "QS8": - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8); - $else: - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero); - $if M < ROW_TILE: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - i${M} += 8; - - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]}); - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - $else: - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { - $for M in range(ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(ROW_SUBTILE + 2): - - $for C in range(0, CHANNEL_TILE, 8): - $if M == 3: - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $elif M > 3: - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $if 1 <= M <= ROW_SUBTILE: - $if DATATYPE == "QS8": - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8); - $else: - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero); - $if M < ROW_SUBTILE: - $if C == 0: - const __m128i vi${M}x${ABC[0:8]} = 
_mm_loadl_epi64((const __m128i*) i${M}); - $else: - const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); - $if M < ROW_SUBTILE: - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]}); - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - $else: - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b)); - $for C in range(4, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (b + ${C}))); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]}); - b += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(ROW_SUBTILE + 3): - - $if M == 4: - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $elif M > 4: - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $if 2 <= M <= ROW_SUBTILE + 1: - $if DATATYPE == "QS8": - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8); - $else: - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero); - $if M < ROW_SUBTILE: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - i${M} += 8; - - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]}); - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - $else: - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b)); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); - $for M in range(1, ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(ROW_SUBTILE + 2): - - $for C in range(0, CHANNEL_TILE, 8): - $if M == 3: - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, 
vxi${M-2}x${ABC[C:C+8]}); - $elif M > 3: - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $if 1 <= M <= ROW_SUBTILE: - $if DATATYPE == "QS8": - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8); - $else: - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero); - $if M < ROW_SUBTILE: - $if C == 0: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - $else: - const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); - $if M < ROW_SUBTILE: - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]}); - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - $else: - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer)); - $for C in range(4, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (buffer + ${C}))); - buffer += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 4): - __m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_mul_ps(vfpacc${ABC[C:C+4]}, vscale); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_min_ps(vfpacc${ABC[C:C+4]}, voutput_max_less_zero_point); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vfpacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 8): - __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point); - - $if DATATYPE == "QS8": - $for C in range(0, CHANNEL_TILE, 8): - vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); - $else: - __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); - - $if DATATYPE == "QU8": - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min); - $else: - vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); - - $if CHANNEL_TILE > 8: - _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); - $else: - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - $for C in range(16, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]}); - $else: - _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]}); - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(ROW_SUBTILE + 3): - - $if M == 4: - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $elif M > 4: - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $if 2 <= M <= ROW_SUBTILE + 1: - $if DATATYPE == "QS8": - const __m128i vxi${M-2}x${ABC[0:8]} = 
_mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8); - $else: - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero); - $if M < ROW_SUBTILE: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - i${M} += 8; - - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]}); - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - $else: - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer)); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]}); - __m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]}); - - vfpacc${ABC[0:4]} = _mm_mul_ps(vfpacc${ABC[0:4]}, vscale); - vfpacc${ABC[4:8]} = _mm_mul_ps(vfpacc${ABC[4:8]}, vscale); - - vfpacc${ABC[0:4]} = _mm_min_ps(vfpacc${ABC[0:4]}, voutput_max_less_zero_point); - vfpacc${ABC[4:8]} = _mm_min_ps(vfpacc${ABC[4:8]}, voutput_max_less_zero_point); - - vacc${ABC[0:4]} = _mm_cvtps_epi32(vfpacc${ABC[0:4]}); - vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]}); - - __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point); - $if DATATYPE == "QS8": - vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min); - - __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]}); - $if DATATYPE == "QU8": - vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout${ABC[0:4]}); - vout${ABC[0:4]} >>= 16; - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) vout${ABC[0:4]}; - output += 1; - } - channels = 0; - } - $else: - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout${ABC[0:4]}); - vout${ABC[0:4]} >>= 16; - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) vout${ABC[0:4]}; - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/multipass-sse4.c.in b/src/qs8-gavgpool/multipass-sse4.c.in deleted file mode 100644 index 991411daf14..00000000000 --- a/src/qs8-gavgpool/multipass-sse4.c.in +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
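A note on the widening idiom: the SSE2 template above has no single-instruction way to sign-extend bytes, so it unpacks each byte against itself and arithmetic-shifts the doubled 16-bit lanes right by 8, while the SSE4.1 template that follows does the same widening with _mm_cvtepi8_epi16. A minimal standalone sketch of the two idioms, not taken from the deleted sources (the test vector and main() are illustrative only):

#include <smmintrin.h>  // SSE4.1; also provides the SSE2 intrinsics used below
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
  // SSE2 idiom: duplicate each byte into both halves of a 16-bit lane,
  // then shift right arithmetically to recover the sign-extended value.
  const __m128i wide_sse2 = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);
  // SSE4.1 idiom: direct sign extension in one instruction.
  const __m128i wide_sse41 = _mm_cvtepi8_epi16(v);
  int16_t a[8], b[8];
  _mm_storeu_si128((__m128i*) a, wide_sse2);
  _mm_storeu_si128((__m128i*) b, wide_sse41);
  for (int i = 0; i < 8; i++) {
    printf("%+d %+d\n", a[i], b[i]);  // lanes match: -1 +2 -3 +4 -5 +6 -7 +8
  }
  return 0;
}

The QU8 variants substitute _mm_unpacklo_epi8(vi, vzero) and _mm_cvtepu8_epi16 instead, since zero extension needs no sign recovery.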
- -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert ROW_SUBTILE >= 3 -$assert ROW_SUBTILE <= ROW_TILE -$assert REQUANTIZATION == "FP32" -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" -$_MM_CVTEPX8_EPI16 = {"QS8": "_mm_cvtepi8_epi16", "QU8": "_mm_cvtepu8_epi16"}[DATATYPE] -$_MM_CVTEPX16_EPI32 = {"QS8": "_mm_cvtepi16_epi32", "QU8": "_mm_cvtepu16_epi32"}[DATATYPE] -$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__sse41_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - int32_t* buffer, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if CHANNEL_TILE <= 16: - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); - $else: - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T}); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - $for C in range(8, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]}); - - $if DATATYPE == "QU8": - const __m128i vzero = _mm_setzero_si128(); - $for C in range(0, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, 
vacc${ABC[C:C+8]}), 16); - $else: - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]}); - b += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]}); - - __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); - $else: - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { - $for M in range(ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - $for C in range(8, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); - - $if DATATYPE == "QU8": - const __m128i vzero = _mm_setzero_si128(); - $for C in range(0, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+4]} 
= ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16); - $else: - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b)); - $for C in range(4, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (b + ${C}))); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - _mm_store_si128((__m128i*) (b + ${C}), vacc${ABC[C:C+4]}); - b += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); - - __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); - $else: - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) b)); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc${ABC[0:4]}); - _mm_store_si128((__m128i*) (b + 4), vacc${ABC[4:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); - $for M in range(1, ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - $for C in range(8, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); - i2 += ${CHANNEL_TILE}; - - 
$for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); - - $if DATATYPE == "QU8": - const __m128i vzero = _mm_setzero_si128(); - $for C in range(0, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[C+4:C+8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16); - $else: - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer)); - $for C in range(4, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, _mm_load_si128((const __m128i*) (buffer + ${C}))); - buffer += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 4): - __m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_mul_ps(vfpacc${ABC[C:C+4]}, vscale); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_min_ps(vfpacc${ABC[C:C+4]}, voutput_max_less_zero_point); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vfpacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 8): - __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); - $else: - __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+16]}, voutput_min); - $else: - vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); - - $if CHANNEL_TILE > 8: - _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); - $else: - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - $for C in range(16, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]}); - $else: - _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]}); - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); - - __m128i 
vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); - $else: - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_load_si128((const __m128i*) buffer)); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]}); - __m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]}); - - vfpacc${ABC[0:4]} = _mm_mul_ps(vfpacc${ABC[0:4]}, vscale); - vfpacc${ABC[4:8]} = _mm_mul_ps(vfpacc${ABC[4:8]}, vscale); - - vfpacc${ABC[0:4]} = _mm_min_ps(vfpacc${ABC[0:4]}, voutput_max_less_zero_point); - vfpacc${ABC[4:8]} = _mm_min_ps(vfpacc${ABC[4:8]}, voutput_max_less_zero_point); - - vacc${ABC[0:4]} = _mm_cvtps_epi32(vfpacc${ABC[0:4]}); - vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]}); - - __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point); - - __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]}); - vout${ABC[0:8]}${ABC[0:8]} = ${_MM_MAX_EPX8}(vout${ABC[0:8]}${ABC[0:8]}, voutput_min); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); - output += 1; - } - channels = 0; - } - $else: - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/multipass-wasmsimd.c.in b/src/qs8-gavgpool/multipass-wasmsimd.c.in deleted file mode 100644 index 890e460c876..00000000000 --- a/src/qs8-gavgpool/multipass-wasmsimd.c.in +++ /dev/null @@ -1,365 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
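A note on the rounding idiom: the wasmsimd template below, like the scalar FMAGIC and IMAGIC variants earlier in this patch, avoids a float-to-int conversion instruction by adding a "magic bias" of 2^23 + 2^22 so that round-to-nearest-even happens in the float adder itself; reinterpreting the bits and subtracting a precomputed constant then yields the rounded integer plus the output zero point in one step. A minimal scalar sketch of that step; magic_round and main() are illustrative only, with constants mirroring the magic_bias and magic_bias_less_output_zero_point params fields:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int32_t magic_round(float x, int32_t output_zero_point) {
  const float vmagic_bias = 12582912.0f;  // 0x1.8p+23 = 2^23 + 2^22
  // Precomputed as in the params structs: the bias's bit pattern minus the zero point.
  const int32_t vmagic_bias_less_output_zero_point =
      INT32_C(0x4B400000) - output_zero_point;
  const float vfpacc = x + vmagic_bias;  // fraction bits now hold round(x)
  int32_t bits;
  memcpy(&bits, &vfpacc, sizeof(bits));  // float_as_uint32() in the kernels
  return bits - vmagic_bias_less_output_zero_point;
}

int main(void) {
  // Valid while x + vmagic_bias stays in [2^23, 2^24); the kernels ensure
  // this by clamping (vmagic_min / voutput_max) around this step.
  printf("%d\n", magic_round(2.5f, 0));   // 2 (ties round to even)
  printf("%d\n", magic_round(-3.7f, 5));  // 1 = round(-3.7) + 5
  return 0;
}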
- -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert ROW_SUBTILE >= 3 -$assert ROW_SUBTILE <= ROW_TILE -$assert REQUANTIZATION == "FP32" -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] -$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE] -$WASM_X32X4_EXTEND_LOW_X16X8 = {"QS8": "wasm_i32x4_extend_low_i16x8", "QU8": "wasm_u32x4_extend_low_u16x8"}[DATATYPE] -$WASM_X32X4_EXTEND_HIGH_X16X8 = {"QS8": "wasm_i32x4_extend_high_i16x8", "QU8": "wasm_u32x4_extend_high_u16x8"}[DATATYPE] -$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] -$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__wasmsimd_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - int32_t* buffer, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if CHANNEL_TILE <= 16: - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); - $else: - const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T}); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - $for C in range(8, CHANNEL_TILE, 8): - v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 8): - const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); - const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); - - wasm_v128_store(b, vacc${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - 
wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]}); - b += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - i2 += 8; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); - - const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); - const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); - - wasm_v128_store(b, vacc${ABC[0:4]}); - wasm_v128_store(b + 4, vacc${ABC[4:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { - $for M in range(ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - $for C in range(8, CHANNEL_TILE, 8): - v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); - - v128_t vacc${ABC[0:4]} = wasm_v128_load(b); - $for C in range(4, CHANNEL_TILE, 4): - v128_t vacc${ABC[C:C+4]} = wasm_v128_load(b + ${C}); - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); - vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); - - wasm_v128_store(b, vacc${ABC[0:4]}); - $for C in range(4, CHANNEL_TILE, 4): - wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]}); - b += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE > 16: - if XNN_UNLIKELY(c != 0) { - do { - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - i2 += 8; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - 
const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); - - v128_t vacc${ABC[0:4]} = wasm_v128_load(b); - v128_t vacc${ABC[4:8]} = wasm_v128_load(b + 4); - - vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); - vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); - - wasm_v128_store(b, vacc${ABC[0:4]}); - wasm_v128_store(b + 4, vacc${ABC[4:8]}); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); - $for M in range(1, ROW_SUBTILE): - i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - $for C in range(8, CHANNEL_TILE, 8): - v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); - - v128_t vacc${ABC[0:4]} = wasm_v128_load(buffer); - $for C in range(4, CHANNEL_TILE, 4): - v128_t vacc${ABC[C:C+4]} = wasm_v128_load(buffer + ${C}); - buffer += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); - vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_f32x4_mul(vacc${ABC[C:C+4]}, vscale); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_f32x4_add(vacc${ABC[C:C+4]}, vmagic_bias); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_i32x4_max(vacc${ABC[C:C+4]}, vmagic_min); - - $for C in range(0, 
CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_i32x4_sub(vacc${ABC[C:C+4]}, vmagic_bias_less_output_zero_point); - - $for C in range(0, CHANNEL_TILE, 8): - v128_t vout${ABC[C:C+8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - v128_t vout${ABC[C:C+16]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); - $else: - v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+16]}, voutput_max); - $else: - vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max); - - $if CHANNEL_TILE > 8: - wasm_v128_store(output, vout${ABC[0:16]}); - $else: - wasm_v128_store64_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - $for C in range(16, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - wasm_v128_store(output + ${C}, vout${ABC[C:C+16]}); - $else: - wasm_v128_store64_lane(output + ${C}, vout${ABC[C:C+8]}${ABC[C:C+8]}, 0); - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - i2 += 8; - - $for M in range(3, ROW_SUBTILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); - - v128_t vacc${ABC[0:4]} = wasm_v128_load(buffer); - v128_t vacc${ABC[4:8]} = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); - vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); - - vacc${ABC[0:4]} = wasm_f32x4_convert_i32x4(vacc${ABC[0:4]}); - vacc${ABC[4:8]} = wasm_f32x4_convert_i32x4(vacc${ABC[4:8]}); - - vacc${ABC[0:4]} = wasm_f32x4_mul(vacc${ABC[0:4]}, vscale); - vacc${ABC[4:8]} = wasm_f32x4_mul(vacc${ABC[4:8]}, vscale); - - vacc${ABC[0:4]} = wasm_f32x4_add(vacc${ABC[0:4]}, vmagic_bias); - vacc${ABC[4:8]} = wasm_f32x4_add(vacc${ABC[4:8]}, vmagic_bias); - - vacc${ABC[0:4]} = wasm_i32x4_max(vacc${ABC[0:4]}, vmagic_min); - vacc${ABC[4:8]} = wasm_i32x4_max(vacc${ABC[4:8]}, vmagic_min); - - vacc${ABC[0:4]} = wasm_i32x4_sub(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point); - vacc${ABC[4:8]} = wasm_i32x4_sub(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point); - - const v128_t vout${ABC[0:8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]}); - v128_t vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[0:8]}, vout${ABC[0:8]}); - vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_MIN}(vout${ABC[0:8]}${ABC[0:8]}, voutput_max); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - 
vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - output += 1; - } - channels = 0; - } - $else: - if (channels & 4) { - wasm_v128_store32_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/unipass-neon.c.in b/src/qs8-gavgpool/unipass-neon.c.in deleted file mode 100644 index 79777dbc912..00000000000 --- a/src/qs8-gavgpool/unipass-neon.c.in +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert REQUANTIZATION in ["FP32", "RNDNU"] -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -$if ARMV8: - #include "xnnpack/intrinsics-polyfill.h" - - -$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon") -$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] -$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE] -$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE] -$XINT16X8_T = {"QS8": "int16x8_t", "QU8": "uint16x8_t"}[DATATYPE] -$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE] -$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE] -$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE] -$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE] -$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE] -$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE] -$VADDL_X8 = {"QS8": "vaddl_s8", "QU8": "vaddl_u8"}[DATATYPE] -$VADDW_X8 = {"QS8": "vaddw_s8", "QU8": "vaddw_u8"}[DATATYPE] -$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE] -$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE] -$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE] -$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE] -$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE] -$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE] -$VQMOVXN_HIGH_S16 = {"QS8": "vqmovn_high_s16", "QU8": "vqmovun_high_s16"}[DATATYPE] -$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE] -$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE] -$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE] -$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE] -$ISA = "neonv8" if ARMV8 else "neon" -void xnn_${DATATYPE.lower()}_gavgpool_minmax_${REQUANTIZATION.lower()}_ukernel_${ROW_TILE}x__${ISA}_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 
${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->${PARAMS_STRUCT}.init_bias); - $if REQUANTIZATION == "FP32": - const float32x4_t vscale = vld1q_dup_f32(&params->${PARAMS_STRUCT}.scale); - $if ARMV8: - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - $else: - const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); - $elif REQUANTIZATION == "RNDNU": - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - $if CHANNEL_TILE > 8: - const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_min); - const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_max); - $else: - const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_min); - const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_max); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(2): - $for C in range(0, CHANNEL_TILE, 8): - const ${XINT8X8_T} vi${M}x${ABC[C:C+8]} = ${VLD1_X8}(i${M}); i${M} += 8; - - $for C in range(0, CHANNEL_TILE, 8): - const ${XINT8X8_T} vi2x${ABC[C:C+8]} = ${VLD1_X8}(i2); i2 += 8; - ${XINT16X8_T} vsum${ABC[C:C+8]} = ${VADDL_X8}(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}); - - $for M in range(2, ROW_TILE): - $for C in range(0, CHANNEL_TILE, 8): - $if M + 1 != ROW_TILE: - const ${XINT8X8_T} vi${M+1}x${ABC[C:C+8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8; - vsum${ABC[C:C+8]} = ${VADDW_X8}(vsum${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 8): - $if DATATYPE == "QS8": - int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[C:C+8]})); - int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[C:C+8]})); - $else: - int32x4_t vacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[C:C+8]}))); - int32x4_t vacc${ABC[C+4:C+8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[C:C+8]}))); - - $if REQUANTIZATION == "FP32": - $for C in range(0, CHANNEL_TILE, 4): - float32x4_t vfpacc${ABC[C:C+4]} = vcvtq_f32_s32(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = vmulq_f32(vfpacc${ABC[C:C+4]}, vscale); - - $if ARMV8: - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = vcvtnq_s32_f32(vfpacc${ABC[C:C+4]}); - $else: - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[C:C+4]}, vmagic_bias)); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = vqsubq_s32(vacc${ABC[C:C+4]}, vmagic_bias_less_output_zero_point); - $elif REQUANTIZATION == "RNDNU": - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = vqshlq_s32(vacc${ABC[C:C+4]}, vleft_pre_shift); - - $for C in range(0, 
CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = vqdmulhq_s32(vacc${ABC[C:C+4]}, vmultiplier); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = vrshlq_s32(vacc${ABC[C:C+4]}, vleft_post_shift); - - #if XNN_ARCH_ARM64 - $for C in range(0, CHANNEL_TILE, 8): - int16x8_t vacc${ABC[C:C+8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc${ABC[C+4:C+8]}); - #else // !XNN_ARCH_ARM64 - $for C in range(0, CHANNEL_TILE, 8): - int16x8_t vacc${ABC[C:C+8]} = vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_s32(vacc${ABC[C+4:C+8]})); - #endif // !XNN_ARCH_ARM64 - - $if REQUANTIZATION != "FP32" or ARMV8: - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = vqaddq_s16(vacc${ABC[C:C+8]}, voutput_zero_point); - - #if XNN_ARCH_ARM64 - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - ${XINT8X16_T} vout${ABC[C:C+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]}); - $else: - ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]}); - #else // !XNN_ARCH_ARM64 - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - ${XINT8X16_T} vout${ABC[C:C+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[C:C+8]}), ${VQMOVXN_S16}(vacc${ABC[C+8:C+16]})); - $else: - ${XINT8X8_T} vout${ABC[C:C+8]} = ${VQMOVXN_S16}(vacc${ABC[C:C+8]}); - #endif // !XNN_ARCH_ARM64 - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = ${VMAXQ_X8}(vout${ABC[C:C+16]}, voutput_min); - $elif CHANNEL_TILE > 8: - vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_min)); - $else: - vout${ABC[C:C+8]} = ${VMAX_X8}(vout${ABC[C:C+8]}, voutput_min); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = ${VMINQ_X8}(vout${ABC[C:C+16]}, voutput_max); - $elif CHANNEL_TILE > 8: - vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, ${VGET_LOW_X8}(voutput_max)); - $else: - vout${ABC[C:C+8]} = ${VMIN_X8}(vout${ABC[C:C+8]}, voutput_max); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - ${VST1Q_X8}(output, vout${ABC[C:C+16]}); output += 16; - $else: - ${VST1_X8}(output, vout${ABC[C:C+8]}); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(3): - const ${XINT8X8_T} vi${M}x${ABC[0:8]} = ${VLD1_X8}(i${M}); i${M} += 8; - ${XINT16X8_T} vsum${ABC[0:8]} = ${VADDL_X8}(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}); - - $for M in range(2, ROW_TILE): - $if M + 1 != ROW_TILE: - const ${XINT8X8_T} vi${M+1}x${ABC[0:8]} = ${VLD1_X8}(i${M+1}); i${M+1} += 8; - vsum${ABC[0:8]} = ${VADDW_X8}(vsum${ABC[0:8]}, vi${M}x${ABC[0:8]}); - - $if DATATYPE == "QS8": - int32x4_t vacc${ABC[0:4]} = vaddw_s16(vinit_bias, vget_low_s16(vsum${ABC[0:8]})); - int32x4_t vacc${ABC[4:8]} = vaddw_s16(vinit_bias, vget_high_s16(vsum${ABC[0:8]})); - $else: - int32x4_t vacc${ABC[0:4]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum${ABC[0:8]}))); - int32x4_t vacc${ABC[4:8]} = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum${ABC[0:8]}))); - - $if REQUANTIZATION == "FP32": - float32x4_t vfpacc${ABC[0:4]} = vcvtq_f32_s32(vacc${ABC[0:4]}); - float32x4_t vfpacc${ABC[4:8]} = vcvtq_f32_s32(vacc${ABC[4:8]}); - - vfpacc${ABC[0:4]} = vmulq_f32(vfpacc${ABC[0:4]}, vscale); - vfpacc${ABC[4:8]} = vmulq_f32(vfpacc${ABC[4:8]}, vscale); - - $if ARMV8: - vacc${ABC[0:4]} = vcvtnq_s32_f32(vfpacc${ABC[0:4]}); - vacc${ABC[4:8]} = vcvtnq_s32_f32(vfpacc${ABC[4:8]}); - $else: - vacc${ABC[0:4]} = 
vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[0:4]}, vmagic_bias)); - vacc${ABC[4:8]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${ABC[4:8]}, vmagic_bias)); - - vacc${ABC[0:4]} = vqsubq_s32(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point); - vacc${ABC[4:8]} = vqsubq_s32(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point); - $elif REQUANTIZATION == "RNDNU": - vacc${ABC[0:4]} = vqshlq_s32(vacc${ABC[0:4]}, vleft_pre_shift); - vacc${ABC[4:8]} = vqshlq_s32(vacc${ABC[4:8]}, vleft_pre_shift); - - vacc${ABC[0:4]} = vqdmulhq_s32(vacc${ABC[0:4]}, vmultiplier); - vacc${ABC[4:8]} = vqdmulhq_s32(vacc${ABC[4:8]}, vmultiplier); - - vacc${ABC[0:4]} = vrshlq_s32(vacc${ABC[0:4]}, vleft_post_shift); - vacc${ABC[4:8]} = vrshlq_s32(vacc${ABC[4:8]}, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc${ABC[0:8]} = vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${ABC[4:8]}); - #else - int16x8_t vacc${ABC[0:8]} = vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]})); - #endif - $if REQUANTIZATION != "FP32" or ARMV8: - vacc${ABC[0:8]} = vqaddq_s16(vacc${ABC[0:8]}, voutput_zero_point); - - ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]}); - $if CHANNEL_TILE > 8: - vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min)); - vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - ${VST1_X8}(output, vout${ABC[0:8]}); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2); - } - if (channels & 1) { - ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); output += 1; - } - channels = 0; - } - $else: - vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min); - vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2; - vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2); - } - if (channels & 1) { - ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/unipass-scalar.c.in b/src/qs8-gavgpool/unipass-scalar.c.in deleted file mode 100644 index 769a473b465..00000000000 --- a/src/qs8-gavgpool/unipass-scalar.c.in +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
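// Editor's note: the deleted scalar template that follows implements FP32
// requantization in three variants (FMAGIC, IMAGIC, LRINTF). A minimal,
// self-contained sketch of the FMAGIC "magic bias" rounding trick it relies
// on is below; the scale and zero-point values are illustrative assumptions,
// not taken from this patch.
#include <assert.h>
#include <stdint.h>
#include <string.h>

static inline uint32_t float_bits(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u);  // well-defined bit cast
  return u;
}

// Scale one int32 accumulator, clamp in float space, then add 1.5*2**23 so the
// rounded integer lands in the low mantissa bits of the float representation.
static int8_t requantize_fmagic(int32_t acc, float scale, int32_t zero_point) {
  const float vmagic_bias = 12582912.0f;  // 0x1.8p+23, bit pattern 0x4B400000
  const int32_t vmagic_bias_less_zp = (int32_t) float_bits(vmagic_bias) - zero_point;
  float vfpacc = (float) acc * scale;
  const float vmin_less_zp = (float) (-128 - zero_point);  // int8 output range
  const float vmax_less_zp = (float) (127 - zero_point);
  vfpacc = vfpacc < vmin_less_zp ? vmin_less_zp : vfpacc;
  vfpacc = vfpacc > vmax_less_zp ? vmax_less_zp : vfpacc;
  vfpacc += vmagic_bias;  // round-to-nearest-even happens in this addition
  return (int8_t) ((int32_t) float_bits(vfpacc) - vmagic_bias_less_zp);
}

int main(void) {
  // Averaging 7 rows that sum to 700 with scale 1/7 rounds to 100.
  assert(requantize_fmagic(700, 1.0f / 7.0f, 0) == 100);
  return 0;
}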
- -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE >= 1 -$assert ROW_TILE >= 3 -$assert REQUANTIZATION == "FP32" -#include <assert.h> -$if VARIANT == "LRINTF": - #include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower() -$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" -$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" -$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__scalar_${VARIANT.lower()}_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias; - const float vscale = params->${PARAMS_STRUCT}.scale; - $if VARIANT == "FMAGIC": - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - $if CHANNEL_TILE > 1: - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for C in range(CHANNEL_TILE): - const int32_t vi0x${C} = (int32_t) i0[${C}]; - i0 += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - int32_t vacc${C} = vi0x${C} + vinit_bias; - const int32_t vi1x${C} = (int32_t) i1[${C}]; - i1 += ${CHANNEL_TILE}; - - $for M in range(2, ROW_TILE): - $for C in range(CHANNEL_TILE): - vacc${C} += vi${M-1}x${C}; - const int32_t vi${M}x${C} = (int32_t) i${M}[${C}]; - i${M} += ${CHANNEL_TILE}; - - $for C in range(CHANNEL_TILE): - vacc${C} += vi${ROW_TILE-1}x${C}; - - $for C in range(CHANNEL_TILE): - float vfpacc${C} = (float) vacc${C} * vscale; - - $if VARIANT == "FMAGIC": - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); - - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); - - $for C in range(CHANNEL_TILE): - vfpacc${C} += vmagic_bias; - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - $for C in range(CHANNEL_TILE): - vfpacc${C} += 
vmagic_bias; - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = (int32_t) float_as_uint32(vfpacc${C}); - - $for C in range(CHANNEL_TILE): - vout${C} = math_max_s32(vout${C}, vmagic_min); - - $for C in range(CHANNEL_TILE): - vout${C} = math_min_s32(vout${C}, vmagic_max); - - $for C in range(CHANNEL_TILE): - vout${C} -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MAX_F32}(vfpacc${C}, voutput_min_less_zero_point); - - $for C in range(CHANNEL_TILE): - vfpacc${C} = ${MIN_F32}(vfpacc${C}, voutput_max_less_zero_point); - - $for C in range(CHANNEL_TILE): - const int32_t vrndacc${C} = (int32_t) lrintf(vfpacc${C}); - - $for C in range(CHANNEL_TILE): - int32_t vout${C} = vrndacc${C} + voutput_zero_point; - - $for C in range(CHANNEL_TILE): - output[${C}] = (${XINT8_T}) vout${C}; - output += ${CHANNEL_TILE}; - } - $if CHANNEL_TILE == 1: - do { - int32_t vacc = vinit_bias; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(2, ROW_TILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(ROW_TILE - 2, ROW_TILE): - vacc += vi${M}; - - float vfpacc = (float) vacc * vscale; - $if VARIANT == "FMAGIC": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (${XINT8_T}) vout; - } while (--channels != 0); - $else: - if XNN_UNLIKELY(channels != 0) { - $if CHANNEL_TILE == 2: - int32_t vacc = vinit_bias; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}; - - $for M in range(2, ROW_TILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}; - - $for M in range(ROW_TILE - 2, ROW_TILE): - vacc += vi${M}; - - float vfpacc = (float) vacc * vscale; - $if VARIANT == "FMAGIC": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output = (${XINT8_T}) vout; - $else: - do { - int32_t vacc = vinit_bias; - $for M in range(2): - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(2, ROW_TILE): - vacc += vi${M-2}; - const int32_t vi${M} = (int32_t) *i${M}++; - - $for M in range(ROW_TILE - 2, ROW_TILE): - vacc += vi${M}; - - float vfpacc = (float) vacc * vscale; - $if VARIANT == "FMAGIC": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - 
vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - $elif VARIANT == "IMAGIC": - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - $elif VARIANT == "LRINTF": - vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point); - vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (${XINT8_T}) vout; - } while (--channels != 0); - } -} diff --git a/src/qs8-gavgpool/unipass-sse2.c.in b/src/qs8-gavgpool/unipass-sse2.c.in deleted file mode 100644 index 8c7586a04c5..00000000000 --- a/src/qs8-gavgpool/unipass-sse2.c.in +++ /dev/null @@ -1,218 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert REQUANTIZATION == "FP32" -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" -$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__sse2_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - $if DATATYPE == "QU8": - const __m128i vzero = _mm_setzero_si128(); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(ROW_TILE + 2): - - $for C in range(0, CHANNEL_TILE, 8): - $if M == 3: - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi${M-3}x${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $elif M > 3: - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-2}x${ABC[C:C+8]}); - $if 1 <= M <= ROW_TILE: - $if DATATYPE == "QS8": - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vi${M-1}x${ABC[C:C+8]}), 8); - $else: - const __m128i vxi${M-1}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M-1}x${ABC[C:C+8]}, vzero); - $if M < ROW_TILE: - $if C == 0: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - $else: - const __m128i 
vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); - $if M < ROW_TILE: - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[C:C+8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[C:C+8]}); - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vsgnacc${ABC[C:C+8]}); - $else: - __m128i vacc${ABC[C:C+4]} = _mm_unpacklo_epi16(vacc${ABC[C:C+8]}, vzero); - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias); - - $for C in range(0, CHANNEL_TILE, 4): - __m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_mul_ps(vfpacc${ABC[C:C+4]}, vscale); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_min_ps(vfpacc${ABC[C:C+4]}, voutput_max_less_zero_point); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vfpacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 8): - __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point); - - $if DATATYPE == "QS8": - $for C in range(0, CHANNEL_TILE, 8): - vout${ABC[C:C+8]} = _mm_max_epi16(vout${ABC[C:C+8]}, voutput_min); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); - $else: - __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); - - $if DATATYPE == "QU8": - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = _mm_max_epu8(vout${ABC[C:C+16]}, voutput_min); - $else: - vout${ABC[C:C+8]}${ABC[C:C+8]} = _mm_max_epu8(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); - - $if CHANNEL_TILE > 8: - _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); - $else: - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - $for C in range(16, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]}); - $else: - _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]}); - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(ROW_TILE + 3): - - $if M == 4: - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi${M-4}x${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $elif M > 4: - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-3}x${ABC[0:8]}); - $if 2 <= M <= ROW_TILE + 1: - $if DATATYPE == "QS8": - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_srai_epi16(_mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vi${M-2}x${ABC[0:8]}), 8); - $else: - const __m128i vxi${M-2}x${ABC[0:8]} = _mm_unpacklo_epi8(vi${M-2}x${ABC[0:8]}, vzero); - $if M < ROW_TILE: - const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); - i${M} += 8; - - $if DATATYPE == "QS8": - const __m128i vsgnacc${ABC[0:8]} = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc${ABC[0:8]}); - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, vsgnacc${ABC[0:8]}); - $else: - __m128i vacc${ABC[0:4]} = _mm_unpacklo_epi16(vacc${ABC[0:8]}, vzero); - __m128i vacc${ABC[4:8]} = 
_mm_unpackhi_epi16(vacc${ABC[0:8]}, vzero); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias); - - __m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]}); - __m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]}); - - vfpacc${ABC[0:4]} = _mm_mul_ps(vfpacc${ABC[0:4]}, vscale); - vfpacc${ABC[4:8]} = _mm_mul_ps(vfpacc${ABC[4:8]}, vscale); - - vfpacc${ABC[0:4]} = _mm_min_ps(vfpacc${ABC[0:4]}, voutput_max_less_zero_point); - vfpacc${ABC[4:8]} = _mm_min_ps(vfpacc${ABC[4:8]}, voutput_max_less_zero_point); - - vacc${ABC[0:4]} = _mm_cvtps_epi32(vfpacc${ABC[0:4]}); - vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]}); - - __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point); - $if DATATYPE == "QS8": - vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min); - - __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]}); - $if DATATYPE == "QU8": - vout${ABC[0:8]}${ABC[0:8]} = _mm_max_epu8(vout${ABC[0:8]}${ABC[0:8]}, voutput_min); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout${ABC[0:4]}); - vout${ABC[0:4]} >>= 16; - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) vout${ABC[0:4]}; - output += 1; - } - channels = 0; - } - $else: - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - uint32_t vout${ABC[0:4]} = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]}); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout${ABC[0:4]}); - vout${ABC[0:4]} >>= 16; - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) vout${ABC[0:4]}; - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/unipass-sse4.c.in b/src/qs8-gavgpool/unipass-sse4.c.in deleted file mode 100644 index 8b0cf87e015..00000000000 --- a/src/qs8-gavgpool/unipass-sse4.c.in +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
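// Editor's note: the SSE4.1 template that follows differs from the SSE2 one
// above chiefly in how signed int8 lanes are widened before accumulation. A
// small stand-alone sketch of the two equivalent widening idioms (assumes an
// SSE4.1-capable build, e.g. compiled with -msse4.1):
#include <assert.h>
#include <smmintrin.h>

// SSE4.1: direct sign extension of the low eight int8 lanes to int16.
static inline __m128i widen_i8_sse41(__m128i v) {
  return _mm_cvtepi8_epi16(v);
}

// SSE2: interleave each byte with itself, then arithmetic-shift right by 8;
// the shift drags the sign bit across the high byte of every int16 lane.
static inline __m128i widen_i8_sse2(__m128i v) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);
}

int main(void) {
  const __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
  const __m128i eq = _mm_cmpeq_epi16(widen_i8_sse41(v), widen_i8_sse2(v));
  assert(_mm_movemask_epi8(eq) == 0xFFFF);  // both idioms agree lane-for-lane
  return 0;
}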
- -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert REQUANTIZATION == "FP32" -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t" -$_MM_CVTEPX8_EPI16 = {"QS8": "_mm_cvtepi8_epi16", "QU8": "_mm_cvtepu8_epi16"}[DATATYPE] -$_MM_CVTEPX16_EPI32 = {"QS8": "_mm_cvtepi16_epi32", "QU8": "_mm_cvtepu16_epi32"}[DATATYPE] -$_MM_PACKXS_EPI16 = {"QS8": "_mm_packs_epi16", "QU8": "_mm_packus_epi16"}[DATATYPE] -$_MM_MAX_EPX8 = {"QS8": "_mm_max_epi8", "QU8": "_mm_max_epu8"}[DATATYPE] -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__sse41_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - $for C in range(8, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+8]} = _mm_add_epi16(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const __m128i vxi2x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i2 + ${C}))); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const __m128i vxi${M}x${ABC[C:C+8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) (i${M} + ${C}))); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = _mm_add_epi16(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]}); - - $if DATATYPE == "QU8": - const __m128i vzero = _mm_setzero_si128(); - $for C in range(0, CHANNEL_TILE, 8): - __m128i vacc${ABC[C:C+4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[C:C+8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[C+4:C+8]} = 
_mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vacc${ABC[C:C+8]}), 16); - $else: - __m128i vacc${ABC[C+4:C+8]} = _mm_unpackhi_epi16(vacc${ABC[C:C+8]}, vzero); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_add_epi32(vacc${ABC[C:C+4]}, vinit_bias); - - $for C in range(0, CHANNEL_TILE, 4): - __m128 vfpacc${ABC[C:C+4]} = _mm_cvtepi32_ps(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_mul_ps(vfpacc${ABC[C:C+4]}, vscale); - - $for C in range(0, CHANNEL_TILE, 4): - vfpacc${ABC[C:C+4]} = _mm_min_ps(vfpacc${ABC[C:C+4]}, voutput_max_less_zero_point); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = _mm_cvtps_epi32(vfpacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 8): - __m128i vout${ABC[C:C+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}), voutput_zero_point); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - __m128i vout${ABC[C:C+16]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); - $else: - __m128i vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_PACKXS_EPI16}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+16]}, voutput_min); - $else: - vout${ABC[C:C+8]}${ABC[C:C+8]} = ${_MM_MAX_EPX8}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_min); - - $if CHANNEL_TILE > 8: - _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]}); - $else: - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - $for C in range(16, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - _mm_storeu_si128((__m128i*) (output + ${C}), vout${ABC[C:C+16]}); - $else: - _mm_storel_epi64((__m128i*) (output + ${C}), vout${ABC[C:C+8]}${ABC[C:C+8]}); - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(2): - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - __m128i vacc${ABC[0:8]} = _mm_add_epi16(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const __m128i vxi2x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const __m128i vxi${M}x${ABC[0:8]} = ${_MM_CVTEPX8_EPI16}(_mm_loadl_epi64((const __m128i*) i${M})); - i${M} += 8; - - vacc${ABC[0:8]} = _mm_add_epi16(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]}); - - __m128i vacc${ABC[0:4]} = ${_MM_CVTEPX16_EPI32}(vacc${ABC[0:8]}); - $if DATATYPE == "QS8": - __m128i vacc${ABC[4:8]} = _mm_srai_epi32(_mm_unpackhi_epi16(vacc${ABC[0:8]}, vacc${ABC[0:8]}), 16); - $else: - __m128i vacc${ABC[4:8]} = _mm_unpackhi_epi16(vacc${ABC[0:8]}, _mm_setzero_si128()); - - vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, vinit_bias); - vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, vinit_bias); - - __m128 vfpacc${ABC[0:4]} = _mm_cvtepi32_ps(vacc${ABC[0:4]}); - __m128 vfpacc${ABC[4:8]} = _mm_cvtepi32_ps(vacc${ABC[4:8]}); - - vfpacc${ABC[0:4]} = _mm_mul_ps(vfpacc${ABC[0:4]}, vscale); - vfpacc${ABC[4:8]} = _mm_mul_ps(vfpacc${ABC[4:8]}, vscale); - - vfpacc${ABC[0:4]} = _mm_min_ps(vfpacc${ABC[0:4]}, voutput_max_less_zero_point); - vfpacc${ABC[4:8]} = _mm_min_ps(vfpacc${ABC[4:8]}, voutput_max_less_zero_point); - - vacc${ABC[0:4]} = _mm_cvtps_epi32(vfpacc${ABC[0:4]}); - vacc${ABC[4:8]} = _mm_cvtps_epi32(vfpacc${ABC[4:8]}); - - __m128i vout${ABC[0:8]} = 
_mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point); - - __m128i vout${ABC[0:8]}${ABC[0:8]} = ${_MM_PACKXS_EPI16}(vout${ABC[0:8]}, vout${ABC[0:8]}); - vout${ABC[0:8]}${ABC[0:8]} = ${_MM_MAX_EPX8}(vout${ABC[0:8]}${ABC[0:8]}, voutput_min); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]}); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); - output += 1; - } - channels = 0; - } - $else: - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]})); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0)); - vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - *output = (${XINT8_T}) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qs8-gavgpool/unipass-wasmsimd.c.in b/src/qs8-gavgpool/unipass-wasmsimd.c.in deleted file mode 100644 index f0345b058d1..00000000000 --- a/src/qs8-gavgpool/unipass-wasmsimd.c.in +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
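// Editor's note: the WAsm SIMD template that follows also uses the magic-bias
// requantization, but it takes the lower clamp on the biased integer bit
// pattern (wasm_i32x4_max against magic_min) and defers the upper clamp to a
// byte-wise min after narrowing. A sketch of the per-4-lane step; the
// parameter vectors are assumed to be prepared as in the deleted params
// struct (builds with clang --target=wasm32 -msimd128):
#include <wasm_simd128.h>

static inline v128_t requantize_f32x4(v128_t vacc, v128_t vscale, v128_t vmagic_bias,
                                      v128_t vmagic_min, v128_t vmagic_bias_less_zp) {
  v128_t vfp = wasm_f32x4_convert_i32x4(vacc);      // int32 accumulator -> float
  vfp = wasm_f32x4_mul(vfp, vscale);                // apply the averaging scale
  vfp = wasm_f32x4_add(vfp, vmagic_bias);           // bits become 0x4B400000 + n
  vfp = wasm_i32x4_max(vfp, vmagic_min);            // lower clamp on the biased pattern
  return wasm_i32x4_sub(vfp, vmagic_bias_less_zp);  // back to int32 around the zero point
}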
- -$assert DATATYPE in ["QS8", "QU8"] -$assert CHANNEL_TILE % 8 == 0 -$assert CHANNEL_TILE >= 8 -$assert ROW_TILE >= 3 -$assert REQUANTIZATION == "FP32" -$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" - - -$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] -$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE] -$WASM_X32X4_EXTEND_LOW_X16X8 = {"QS8": "wasm_i32x4_extend_low_i16x8", "QU8": "wasm_u32x4_extend_low_u16x8"}[DATATYPE] -$WASM_X32X4_EXTEND_HIGH_X16X8 = {"QS8": "wasm_i32x4_extend_high_i16x8", "QU8": "wasm_u32x4_extend_high_u16x8"}[DATATYPE] -$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] -$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] -void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}x__wasmsimd_c${CHANNEL_TILE}( - size_t rows, - size_t channels, - const ${XINT8_T}* input, - size_t input_stride, - const ${XINT8_T}* zero, - ${XINT8_T}* output, - const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= ${ROW_TILE}); - assert(channels != 0); - - const ${XINT8_T}* i0 = input; - $for M in range(1, ROW_TILE): - const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); - $if M % 2 == 1: - if XNN_UNPREDICTABLE(rows < ${M+1}) { - i${M} = zero; - } - $else: - if XNN_UNPREDICTABLE(rows <= ${M}) { - i${M} = zero; - } - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - $for C in range(8, CHANNEL_TILE, 8): - v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); - const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); - i2 += ${CHANNEL_TILE}; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - $for C in range(8, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); - const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); - i${M} += ${CHANNEL_TILE}; - - $for C in range(0, CHANNEL_TILE, 8): - vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 8): - v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); - v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, 
${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[C:C+4]}); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_f32x4_mul(vacc${ABC[C:C+4]}, vscale); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_f32x4_add(vacc${ABC[C:C+4]}, vmagic_bias); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_i32x4_max(vacc${ABC[C:C+4]}, vmagic_min); - - $for C in range(0, CHANNEL_TILE, 4): - vacc${ABC[C:C+4]} = wasm_i32x4_sub(vacc${ABC[C:C+4]}, vmagic_bias_less_output_zero_point); - - $for C in range(0, CHANNEL_TILE, 8): - v128_t vout${ABC[C:C+8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - v128_t vout${ABC[C:C+16]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); - $else: - v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); - - $for C in range(0, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - vout${ABC[C:C+16]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+16]}, voutput_max); - $else: - vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max); - - $if CHANNEL_TILE > 8: - wasm_v128_store(output, vout${ABC[0:16]}); - $else: - wasm_v128_store64_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - $for C in range(16, CHANNEL_TILE, 16): - $if C + 8 < CHANNEL_TILE: - wasm_v128_store(output + ${C}, vout${ABC[C:C+16]}); - $else: - wasm_v128_store64_lane(output + ${C}, vout${ABC[C:C+8]}${ABC[C:C+8]}, 0); - output += ${CHANNEL_TILE}; - } - if XNN_UNLIKELY(channels != 0) { - ${"do " if CHANNEL_TILE > 8 else ""}{ - $for M in range(2): - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); - const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); - i2 += 8; - - $for M in range(3, ROW_TILE): - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); - const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); - i${M} += 8; - - vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_TILE-1}x${ABC[0:8]}); - - v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); - v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); - - vacc${ABC[0:4]} = wasm_f32x4_convert_i32x4(vacc${ABC[0:4]}); - vacc${ABC[4:8]} = wasm_f32x4_convert_i32x4(vacc${ABC[4:8]}); - - vacc${ABC[0:4]} = wasm_f32x4_mul(vacc${ABC[0:4]}, vscale); - vacc${ABC[4:8]} = wasm_f32x4_mul(vacc${ABC[4:8]}, vscale); - - vacc${ABC[0:4]} = wasm_f32x4_add(vacc${ABC[0:4]}, vmagic_bias); - vacc${ABC[4:8]} = wasm_f32x4_add(vacc${ABC[4:8]}, vmagic_bias); - - vacc${ABC[0:4]} = wasm_i32x4_max(vacc${ABC[0:4]}, vmagic_min); - vacc${ABC[4:8]} = wasm_i32x4_max(vacc${ABC[4:8]}, vmagic_min); - - vacc${ABC[0:4]} = wasm_i32x4_sub(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point); - vacc${ABC[4:8]} = wasm_i32x4_sub(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point); - - const v128_t vout${ABC[0:8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]}); - v128_t vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[0:8]}, vout${ABC[0:8]}); - vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_MIN}(vout${ABC[0:8]}${ABC[0:8]}, voutput_max); - - $if CHANNEL_TILE > 8: - if XNN_LIKELY(channels >= 8) { - 
wasm_v128_store64_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - output += 1; - } - channels = 0; - } - $else: - if (channels & 4) { - wasm_v128_store32_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout${ABC[0:8]}${ABC[0:8]}, 0); - } - }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c deleted file mode 100644 index a74833921c3..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c16.c +++ /dev/null @@ -1,315 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); 
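// Editor's note: this generated qu8 multipass kernel, like all the deleted
// 7p7x variants, works in three phases: the first channel loop sums rows
// 0-6 into the int32 scratch `buffer` (seeded with init_bias); the middle
// loop folds in 7 further rows per iteration, re-reading and re-writing
// `buffer`; the final pass adds the last 1-7 rows (redirecting exhausted row
// pointers to `zero`), requantizes with the FP32 magic-bias path, and writes
// uint8 output. `input_increment` rewinds the row pointers from the end of
// one 7-row block to the start of the next.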
- const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = 
vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
-
-      vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
-      vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
-      vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF)));
-      vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF)));
-
-      vst1q_s32(b, vacc0123); b += 4;
-      vst1q_s32(b, vacc4567); b += 4;
-      vst1q_s32(b, vacc89AB); b += 4;
-      vst1q_s32(b, vaccCDEF); b += 4;
-    }
-  }
-
-  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
-  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
-  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
-  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
-  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
-  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
-  for (; channels >= 16; channels -= 16) {
-    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
-
-    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
-    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
-
-    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
-    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
-    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
-    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
-    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
-    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
-    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
-    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
-    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
-    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
-    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
-    int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
-    int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
-
-    vacc0123 =
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = 
vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c deleted file mode 100644 index 0a6f5703d8c..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c24.c +++ /dev/null @@ -1,437 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
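For review context: every 7p7x multipass kernel deleted in this patch follows the same three-phase scheme visible in the c16 file above and repeated in the c24, c32, and c8 variants below. A first pass sums 7 input rows per channel, widening uint8 to uint16 to int32, and seeds an int32 scratch buffer with params->fp32_neon.init_bias; each middle pass folds 7 more rows into that buffer; a final pass adds the last 1-7 rows and requantizes. A minimal scalar sketch of the accumulation phases follows (illustrative names, not the XNNPACK API; the real kernels advance the row pointers inside the channel loop and compensate with input_increment):

#include <stddef.h>
#include <stdint.h>

/* Scalar equivalent of the first and middle passes of a 7p7x kernel. */
static void gavgpool_7p7x_accumulate(
    size_t rows, size_t channels, const uint8_t* input, size_t input_stride,
    int32_t init_bias, int32_t* buffer)
{
  /* First pass: buffer[c] = init_bias + sum of rows 0..6. */
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = init_bias;
    for (size_t r = 0; r < 7; r++) {
      acc += (int32_t) input[r * input_stride + c];
    }
    buffer[c] = acc;
  }
  input += 7 * input_stride;
  /* Middle passes: fold 7 more rows into the buffer per iteration,
     leaving 1..7 rows for the final (requantizing) pass. */
  for (rows -= 7; rows > 7; rows -= 7) {
    for (size_t c = 0; c < channels; c++) {
      int32_t acc = buffer[c];
      for (size_t r = 0; r < 7; r++) {
        acc += (int32_t) input[r * input_stride + c];
      }
      buffer[c] = acc;
    }
    input += 7 * input_stride;
  }
}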
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24(
-    size_t rows,
-    size_t channels,
-    const uint8_t* input,
-    size_t input_stride,
-    const uint8_t* zero,
-    int32_t* buffer,
-    uint8_t* output,
-    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-  const uint8_t* i0 = input;
-  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
-  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
-  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
-  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
-  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
-  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
-  int32_t* b = buffer;
-  size_t c = channels;
-  for (; c >= 24; c -= 24) {
-    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
-
-    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
-    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
-    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
-
-    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
-    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
-    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
-    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
-    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
-    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
-    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
-    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
-    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
-    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
-    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
-    const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
-    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
-
-    const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
-    const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias),
vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); 
- const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + 
input_increment);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
-  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
-  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
-  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
-  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
-  for (; channels >= 24; channels -= 24) {
-    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
-
-    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
-    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
-    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
-
-    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
-    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
-    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
-    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
-    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
-    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
-    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
-    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
-    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
-    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
-    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF);
-    const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN);
-    int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4;
-    int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;
-    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
-    int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4;
-    int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF);
-    int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4;
-    int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN);
-
-    vacc0123 =
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min)); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max)); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1_u8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = 
vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c deleted file mode 100644 index 78bab632c79..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c32.c +++ /dev/null @@ -1,500 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
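The final pass of each of these fp32 kernels requantizes with the "magic bias" trick seen above: the int32 accumulator is converted to float, multiplied by params->fp32_neon.scale (which folds in the division by the pooling size), biased so that rounding lands in the low mantissa bits, reinterpreted as an integer, and then magic_bias_less_output_zero_point is subtracted with saturation. A scalar sketch of one lane, assuming the standard magic-bias constant of 0x1.8p+23 (the function name is hypothetical; the kernels clamp with vmax/vmin and a saturating subtract where this sketch clamps explicitly):

#include <stdint.h>
#include <string.h>

static uint8_t requantize_fp32_magic_bias(
    int32_t acc, float scale, uint8_t output_zero_point,
    uint8_t output_min, uint8_t output_max)
{
  const float magic_bias = 12582912.0f;  /* 0x1.8p+23 */
  const int32_t magic_bias_less_output_zero_point =
      INT32_C(0x4B400000) - (int32_t) output_zero_point;

  /* Adding 0x1.8p+23 places round(acc * scale) in the low mantissa bits. */
  const float fpacc = (float) acc * scale + magic_bias;
  int32_t bits;
  memcpy(&bits, &fpacc, sizeof(bits));  /* reinterpret float as int32 */

  /* Subtracting the biased constant recovers round(acc * scale) + zero_point. */
  int32_t out = bits - magic_bias_less_output_zero_point;
  if (out < (int32_t) output_min) out = (int32_t) output_min;
  if (out > (int32_t) output_max) out = (int32_t) output_max;
  return (uint8_t) out;
}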
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32(
-    size_t rows,
-    size_t channels,
-    const uint8_t* input,
-    size_t input_stride,
-    const uint8_t* zero,
-    int32_t* buffer,
-    uint8_t* output,
-    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-  const uint8_t* i0 = input;
-  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
-  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
-  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
-  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
-  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
-  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
-  int32_t* b = buffer;
-  size_t c = channels;
-  for (; c >= 32; c -= 32) {
-    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
-
-    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
-    const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF);
-    const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN);
-    const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV);
-
-    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
-    const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF);
-    const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN);
-    const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8;
-    vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV);
-    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
-    const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF);
-    const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN);
-    const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8;
-    vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV);
-    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
-    const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8;
-    vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF);
-    const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8;
-    vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN);
-    const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8;
-    vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV);
-    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
-    const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8;
- vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - const int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV))); - const int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = 
vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(b + 24); - int32x4_t vaccSTUV = vld1q_s32(b + 28); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), 
vget_high_u16(vsum89ABCDEF)));
-      vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN)));
-      vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN)));
-      vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV)));
-      vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV)));
-
-      vst1q_s32(b, vacc0123); b += 4;
-      vst1q_s32(b, vacc4567); b += 4;
-      vst1q_s32(b, vacc89AB); b += 4;
-      vst1q_s32(b, vaccCDEF); b += 4;
-      vst1q_s32(b, vaccGHIJ); b += 4;
-      vst1q_s32(b, vaccKLMN); b += 4;
-      vst1q_s32(b, vaccOPQR); b += 4;
-      vst1q_s32(b, vaccSTUV); b += 4;
-    }
-    if XNN_UNLIKELY(c != 0) {
-      do {
-        const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-        const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-        const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
-        uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
-
-        const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
-        vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
-        const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
-        vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
-        const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
-        vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
-        const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
-        vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
-        int32x4_t vacc0123 = vld1q_s32(b);
-        int32x4_t vacc4567 = vld1q_s32(b + 4);
-        vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
-
-        vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567)));
-        vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567)));
-
-        vst1q_s32(b, vacc0123); b += 4;
-        vst1q_s32(b, vacc4567); b += 4;
-
-        c = doz(c, 8);
-      } while (c != 0);
-    }
-  }
-
-  i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
-  i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
-  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
-  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
-  const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
-  const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
-  for (; channels >= 32; channels -= 32) {
-    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8;
-    const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8;
-
- const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), 
vget_high_u16(vsumGHIJKLMN))); - vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV))); - vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - vaccOPQR = vreinterpretq_s32_f32(vaddq_f32(vfpaccOPQR, vmagic_bias)); - vaccSTUV = vreinterpretq_s32_f32(vaddq_f32(vfpaccSTUV, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = vqsubq_s32(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = 
vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c deleted file mode 100644 index 8bc8b655bdf..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neon-c8.c +++ /dev/null @@ -1,246 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
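One more shared idiom worth noting while reading the final pass of the c8 file below: the last pass always reads all 7 row pointers, so the kernels point any pointer past the real data at the caller-provided zero vector (the alternating `rows < 2`, `rows <= 2`, ... tests above are just the unrolled spellings of `rows <= k`). Reading zeros leaves the accumulated sums unchanged. Equivalently, as a hypothetical helper (not part of XNNPACK):

#include <stddef.h>
#include <stdint.h>

/* Row k of the final pass contributes only if at least k+1 rows remain;
   otherwise the kernel reads from the zero vector instead. */
static const uint8_t* final_pass_row(
    const uint8_t* ik, const uint8_t* zero, size_t rows, size_t k)
{
  return (rows > k) ? ik : zero;
}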
- -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, 
vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max); - for (; channels >= 8; channels -= 8) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_u8(vout01234567, voutput_min); - - vout01234567 =
vmin_u8(vout01234567, voutput_max); - - vst1_u8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const uint8x8_t vi0x01234567 = vld1_u8(i0); - const uint8x8_t vi1x01234567 = vld1_u8(i1); - const uint8x8_t vi2x01234567 = vld1_u8(i2); - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, voutput_min); - vout01234567 = vmin_u8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c deleted file mode 100644 index 17b15314e71..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c16.c +++ /dev/null @@ -1,310 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*)
((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max); - for
(; channels >= 16; channels -= 16) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = 
vmaxq_u8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c deleted file mode 100644 index 0f305630acd..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c24.c +++ /dev/null @@ -1,431 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 =
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = 
vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if 
XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max); - for (; channels >= 24; channels -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 =
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min)); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max)); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1_u8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - 
vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c deleted file mode 100644 index 366add84137..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c32.c +++ /dev/null @@ -1,493 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const
uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - const int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV))); - const int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 
+= 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(b + 24); - int32x4_t vaccSTUV = vld1q_s32(b + 28); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV))); - vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max); - for (; channels >= 32; channels -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 =
vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - vaccOPQR = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV))); - vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR); - vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, 
vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c deleted file mode 100644 index de02fc77865..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-neonv8-c8.c +++ /dev/null @@ -1,242 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
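For reference, the fp32 requantization that the deleted NEONv8 kernels apply to each int32 sum reduces to: scale in float, round to nearest-even (vcvtnq_s32_f32), add the output zero point with saturation, and clamp to [output_min, output_max]. Below is a minimal scalar model of that path, assuming the default round-to-nearest FP environment; the helper and its name are illustrative, not XNNPACK API:

#include <math.h>
#include <stdint.h>

// Scalar model of the fp32/neonv8 requantization (illustrative, not XNNPACK API).
// lrintf() rounds to nearest-even under the default rounding mode, matching
// vcvtnq_s32_f32; the final clamp mirrors vmax_u8/vmin_u8.
static inline uint8_t qu8_requantize_fp32_model(int32_t vacc, float vscale,
                                                int16_t vzero_point,
                                                uint8_t vmin, uint8_t vmax) {
  const float vfpacc = (float) vacc * vscale;       // vmulq_f32
  long vout = lrintf(vfpacc) + (long) vzero_point;  // vcvtnq_s32_f32 + vqaddq_s16
  if (vout < (long) vmin) vout = (long) vmin;       // vmax_u8
  if (vout > (long) vmax) vout = (long) vmax;       // vmin_u8
  return (uint8_t) vout;
}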
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); -
vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neonv8.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neonv8.output_max); - for (; channels >= 8; channels -= 8) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_u8(vout01234567, voutput_min); - - vout01234567 = vmin_u8(vout01234567, voutput_max); - - vst1_u8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const uint8x8_t vi0x01234567 = vld1_u8(i0); - const uint8x8_t vi1x01234567 =
vld1_u8(i1); - const uint8x8_t vi2x01234567 = vld1_u8(i2); - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, voutput_min); - vout01234567 = vmin_u8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c deleted file mode 100644 index c648c4ecf95..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c1.c +++ /dev/null @@ -1,155 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
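The scalar fmagic kernels that follow replace an explicit float-to-int conversion with a magic-bias trick: the scaled sum is clamped while still in the float domain, then a large magic bias is added so the rounded result lands in the low mantissa bits; reinterpreting the bits and subtracting magic_bias_less_output_zero_point removes the bias and folds in the output zero point with a single integer subtract. A self-contained scalar sketch of exactly this sequence (helper names are illustrative, not XNNPACK API):

#include <stdint.h>
#include <string.h>

// Portable stand-in for XNNPACK's float_as_uint32().
static inline uint32_t float_bits(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u);
  return u;
}

// Sketch of the fmagic requantization performed per channel below.
static inline uint8_t qu8_requantize_fmagic_sketch(int32_t vacc, float vscale,
                                                   float vmin_less_zp, float vmax_less_zp,
                                                   float vmagic_bias,
                                                   int32_t vmagic_bias_less_zp) {
  float vfpacc = (float) vacc * vscale;
  vfpacc = vfpacc > vmin_less_zp ? vfpacc : vmin_less_zp;  // math_max_f32
  vfpacc = vfpacc < vmax_less_zp ? vfpacc : vmax_less_zp;  // math_min_f32
  vfpacc += vmagic_bias;  // forces the rounded integer into the mantissa bits
  return (uint8_t) ((int32_t) float_bits(vfpacc) - vmagic_bias_less_zp);
}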
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = *b; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias =
params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c deleted file mode 100644 index 93266b7ed2c..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c2.c +++ /dev/null @@ -1,261 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; -
i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 2; channels -= 2) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - buffer += 2; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - 
vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = *buffer; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output = (uint8_t) vout; - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c deleted file mode 100644 index 071e4b52297..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-fmagic-c4.c +++ /dev/null @@ -1,367 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
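All of the deleted 7p7x multipass variants, including the c4 kernel below, share a three-phase schedule that the channel unrolling tends to obscure: a first pass seeds an int32 buffer with init_bias plus the first seven rows, each middle pass folds seven more rows into the buffer, and a final pass adds the remaining one-to-seven rows (absent rows read from the shared zero vector) before requantizing. A compressed sketch of that control flow, with pointer advancement between passes elided and requantize() standing in for whichever fmagic/imagic/lrintf/NEON epilogue a concrete kernel uses (all names illustrative):

#include <stddef.h>
#include <stdint.h>

extern uint8_t requantize(int32_t acc);  // placeholder for the per-variant epilogue

void gavgpool_7p7x_sketch(size_t rows, size_t channels, const uint8_t* i[7],
                          const uint8_t* zero, int32_t init_bias,
                          int32_t* buffer, uint8_t* output) {
  // First pass: seed the buffer with init_bias plus the first 7 rows.
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = init_bias;
    for (size_t r = 0; r < 7; r++) acc += i[r][c];
    buffer[c] = acc;
  }
  // Middle passes: 7 rows each (the real kernels advance i[] by input_increment here).
  for (rows -= 7; rows > 7; rows -= 7) {
    for (size_t c = 0; c < channels; c++) {
      int32_t acc = buffer[c];
      for (size_t r = 0; r < 7; r++) acc += i[r][c];
      buffer[c] = acc;
    }
  }
  // Last pass: 1..7 rows remain; rows at index >= `rows` read from `zero`,
  // matching the XNN_UNPREDICTABLE pointer substitutions in the kernels.
  for (size_t r = 1; r < 7; r++) {
    if (rows <= r) i[r] = zero;
  }
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = buffer[c];
    for (size_t r = 0; r < 7; r++) acc += i[r][c];
    output[c] = requantize(acc);
  }
}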
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c
> 0; c -= 4) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = b[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = b[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 4; channels -= 4) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = buffer[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = buffer[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - buffer += 4; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = 
(int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output[2] = (uint8_t) vout2; - output[3] = (uint8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); - } 
-} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c deleted file mode 100644 index 892323eee09..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c1.c +++ /dev/null @@ -1,156 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = *b; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*)
((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c deleted file mode 100644 index cfa0e5b2a39..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c2.c +++ /dev/null @@ -1,265 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
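The imagic kernels that follow differ from the fmagic variants above only in where the clamp runs: the magic bias is added without any prior float clamping, and the min/max bounds (magic_min/magic_max) are applied to the bias-shifted integer bits, trading two float comparisons for two integer ones. A scalar sketch of that epilogue (helper name illustrative; the memcpy reinterpret plays the role of float_as_uint32):

#include <stdint.h>
#include <string.h>

// Sketch of the imagic requantization performed per channel below.
static inline uint8_t qu8_requantize_imagic_sketch(int32_t vacc, float vscale,
                                                   float vmagic_bias,
                                                   int32_t vmagic_min, int32_t vmagic_max,
                                                   int32_t vmagic_bias_less_zero_point) {
  const float vfpacc = (float) vacc * vscale + vmagic_bias;
  int32_t vout;
  memcpy(&vout, &vfpacc, sizeof vout);           // float_as_uint32
  vout = vout > vmagic_min ? vout : vmagic_min;  // math_max_s32
  vout = vout < vmagic_max ? vout : vmagic_max;  // math_min_s32
  return (uint8_t) (vout - vmagic_bias_less_zero_point);
}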
- -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t
vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - for (; channels >= 2; channels -= 2) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - buffer += 2; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0); - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1); - - vout0 = math_max_s32(vout0, vmagic_min); - vout1 = math_max_s32(vout1, vmagic_min); - - vout0 = math_min_s32(vout0, vmagic_max); - vout1 = math_min_s32(vout1, vmagic_max); - - vout0 -= vmagic_bias_less_zero_point; - vout1 -= vmagic_bias_less_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = *buffer; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output = (uint8_t) vout; - } -} diff --git 
a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c deleted file mode 100644 index 7aac3beefba..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-imagic-c4.c +++ /dev/null @@ -1,373 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] =
vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = b[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = b[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t 
vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - for (; channels >= 4; channels -= 4) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = buffer[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = buffer[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - buffer += 4; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0); - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1); - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2); - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3); - - vout0 = math_max_s32(vout0, vmagic_min); - vout1 = math_max_s32(vout1, vmagic_min); - vout2 = math_max_s32(vout2, vmagic_min); - vout3 = math_max_s32(vout3, vmagic_min); - - vout0 = math_min_s32(vout0, vmagic_max); - vout1 = math_min_s32(vout1, vmagic_max); - vout2 = math_min_s32(vout2, vmagic_max); - vout3 = math_min_s32(vout3, vmagic_max); - - vout0 -= vmagic_bias_less_zero_point; - vout1 -= vmagic_bias_less_zero_point; - vout2 -= vmagic_bias_less_zero_point; - vout3 -= vmagic_bias_less_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output[2] = (uint8_t) vout2; - output[3] = (uint8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) 
*i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c deleted file mode 100644 index 495fe90cece..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c1.c +++ /dev/null @@ -1,155 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - do { - int32_t vacc = *b; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - *b++ = vacc; - } while (--c != 0); - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const
uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c deleted file mode 100644 index f67f024c77f..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c2.c +++ /dev/null @@ -1,261 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> -#include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const
int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - b[0] = vacc0; - b[1] = vacc1; - b += 2; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - for (; channels >= 2; channels -= 2) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - buffer += 2; - i0 += 2; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0); - const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1); - - int32_t vout0 = vrndacc0 + voutput_zero_point; - int32_t vout1 = vrndacc1 + voutput_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = *buffer; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + 
voutput_zero_point; - - *output = (uint8_t) vout; - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c deleted file mode 100644 index 1f6906999ab..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-scalar-lrintf-c4.c +++ /dev/null @@ -1,367 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> -#include <math.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t); - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0
+= vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) { - int32_t vacc0 = b[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = b[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = b[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = b[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - b[0] = vacc0; - b[1] = vacc1; - b[2] = vacc2; - b[3] = vacc3; - b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = 
params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - for (; channels >= 4; channels -= 4) { - int32_t vacc0 = buffer[0]; - const int32_t vi0x0 = (int32_t) i0[0]; - int32_t vacc1 = buffer[1]; - const int32_t vi0x1 = (int32_t) i0[1]; - int32_t vacc2 = buffer[2]; - const int32_t vi0x2 = (int32_t) i0[2]; - int32_t vacc3 = buffer[3]; - const int32_t vi0x3 = (int32_t) i0[3]; - buffer += 4; - i0 += 4; - - vacc0 += vi0x0; - const int32_t vi1x0 = (int32_t) i1[0]; - vacc1 += vi0x1; - const int32_t vi1x1 = (int32_t) i1[1]; - vacc2 += vi0x2; - const int32_t vi1x2 = (int32_t) i1[2]; - vacc3 += vi0x3; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0); - const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1); - const int32_t vrndacc2 = (int32_t) lrintf(vfpacc2); - const int32_t vrndacc3 = (int32_t) lrintf(vfpacc3); - - int32_t vout0 = vrndacc0 + voutput_zero_point; - int32_t vout1 = vrndacc1 + voutput_zero_point; - int32_t vout2 = vrndacc2 + voutput_zero_point; - int32_t vout3 = vrndacc3 + voutput_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output[2] = (uint8_t) vout2; - output[3] = (uint8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = *buffer++; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc 
+= vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c deleted file mode 100644 index 0daf638d3cc..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c16.c +++ /dev/null @@ -1,425 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128i vzero = _mm_setzero_si128(); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - i3 += 16; - 
- vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const 
__m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - i3 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if 
XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= 16; channels -= 16) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - i3 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = 
_mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - buffer += 16; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, 
_mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (uint8_t) vout0123; - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c deleted file mode 100644 index 9ee709f325d..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c24.c +++ /dev/null @@ -1,619 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128i vzero = _mm_setzero_si128(); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero); - const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN =
_mm_unpacklo_epi8(vi3xGHIJKLMN, vzero); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const 
__m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero); - const __m128i 
vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - 
_mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= 24; channels -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const 
__m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero); - const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = 
_mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20))); - buffer += 24; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const 
__m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (uint8_t) vout0123; - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c deleted file mode 100644 index 2febe67cec2..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse2-c8.c +++ /dev/null @@ -1,332 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" - -#include "xnnpack/math.h" - -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128i vzero = _mm_setzero_si128(); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; 
c != 0; c = doz(c, 8)) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - for (; channels >= 8; channels -= 8) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, 
vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, 
vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (uint8_t) vout0123; - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c deleted file mode 100644 index 263d2f34564..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c16.c +++ /dev/null @@ -1,351 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" - -#include "xnnpack/math.h" - -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - 
__m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, 
_mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - b += 16; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 16; channels -= 16) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 
8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - buffer += 16; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 
= _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c deleted file mode 100644 index cc8c1e7cd52..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c24.c +++ /dev/null @@ -1,496 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" - -#include "xnnpack/math.h" - -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN 
= _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = 
(const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = 
_mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (b + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (b + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (b + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (b + 20))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - _mm_store_si128((__m128i*) (b + 8), vacc89AB); - _mm_store_si128((__m128i*) (b + 12), vaccCDEF); - _mm_store_si128((__m128i*) (b + 16), vaccGHIJ); - _mm_store_si128((__m128i*) (b + 20), vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = 
_mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 24; channels -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = 
_mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - vacc89AB = _mm_add_epi32(vacc89AB, _mm_load_si128((const __m128i*) (buffer + 8))); - vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_load_si128((const __m128i*) (buffer + 12))); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_load_si128((const __m128i*) (buffer + 16))); - vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_load_si128((const __m128i*) (buffer + 20))); - buffer += 24; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = 
_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c deleted file mode 100644 index 2ab5abdbec1..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-sse41-c8.c +++ /dev/null @@ -1,279 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
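// A minimal scalar sketch of the 7p7x multipass scheme that the deleted
// kernels below implement; the helper name, signature, and rounding here are
// illustrative assumptions, not the XNNPACK API. The first pass seeds a
// per-channel int32 buffer with init_bias plus 7 input rows, each middle pass
// accumulates 7 more rows into the buffer, and the final pass (with short
// inputs padded via the `zero` row) scales, rounds, and clamps down to uint8.
#include <math.h>
#include <stddef.h>
#include <stdint.h>

static void gavgpool_multipass_sketch(
    size_t rows, size_t channels, const uint8_t* input, size_t input_stride,
    int32_t init_bias, float scale, int32_t output_zero_point,
    uint8_t output_min, uint8_t output_max, int32_t* buffer, uint8_t* output)
{
  for (size_t c = 0; c < channels; c++) {
    buffer[c] = init_bias;  // seeded once, in the first pass
  }
  for (size_t r = 0; r < rows; r++) {  // the kernels consume 7 rows per pass
    for (size_t c = 0; c < channels; c++) {
      buffer[c] += (int32_t) input[r * input_stride + c];
    }
  }
  for (size_t c = 0; c < channels; c++) {  // fp32 requantization
    int32_t out = (int32_t) lrintf((float) buffer[c] * scale) + output_zero_point;
    if (out < (int32_t) output_min) { out = (int32_t) output_min; }
    if (out > (int32_t) output_max) { out = (int32_t) output_max; }
    output[c] = (uint8_t) out;
  }
}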
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) 
i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) b)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (b + 4))); - - _mm_store_si128((__m128i*) b, vacc0123); - _mm_store_si128((__m128i*) (b + 4), vacc4567); - b += 8; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 8; channels -= 8) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - 
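// Why the 16-bit accumulation above is safe: each pass sums at most 7 uint8
// rows, and 7 * 255 = 1785 fits comfortably in a uint16 lane, so the widening
// to 32 bits (_mm_cvtepu16_epi32 for the low half, _mm_unpackhi_epi16 against
// zero for the high half) can wait until all 7 rows are in. A scalar sketch of
// the same widen-late pattern (the helper name is illustrative):
#include <stdint.h>

static inline uint32_t sum7_widen_late(const uint8_t x[7]) {
  uint16_t acc = 0;
  for (int r = 0; r < 7; r++) {
    acc = (uint16_t) (acc + x[r]);  // bounded by 7 * 255 = 1785 < 65535
  }
  return (uint32_t) acc;            // widen once, after the 7th row
}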
vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, _mm_load_si128((const __m128i*) buffer)); - vacc4567 = _mm_add_epi32(vacc4567, _mm_load_si128((const __m128i*) (buffer + 4))); - buffer += 8; - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0); - } - } - } -} diff --git 
a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c deleted file mode 100644 index 0238d3eeb1f..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c16.c +++ /dev/null @@ -1,350 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - i6 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, 
wasm_u32x4_extend_high_u16x8(vacc01234567)); - const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - b += 16; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - i6 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - v128_t vacc89AB = wasm_v128_load(b + 8); - v128_t vaccCDEF = wasm_v128_load(b + 12); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - b += 16; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = 
(const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 16; channels -= 16) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - i6 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - v128_t vacc89AB = wasm_v128_load(buffer + 8); - v128_t vaccCDEF = wasm_v128_load(buffer + 12); - buffer += 16; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - - vacc0123 = 
wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - - v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - 
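// The tail sequence here writes the low 4, 2, and finally 1 byte(s) of the
// packed result, shifting the vector right after each partial store so that
// the next, narrower store always reads from lane 0. A portable sketch of the
// same idiom on a uint64_t "register" (illustrative helper, assuming
// little-endian byte order as in wasm):
#include <stdint.h>
#include <string.h>

static void store_tail(uint8_t* out, uint64_t v, size_t n) {  // n = 1..7
  if (n & 4) { uint32_t v4 = (uint32_t) v; memcpy(out, &v4, 4); v >>= 32; out += 4; }
  if (n & 2) { uint16_t v2 = (uint16_t) v; memcpy(out, &v2, 2); v >>= 16; out += 2; }
  if (n & 1) { *out = (uint8_t) v; }
}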
wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c deleted file mode 100644 index 8095e1ce6fb..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c24.c +++ /dev/null @@ -1,492 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - i0 += 24; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = 
wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - const v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - const v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - i0 += 24; - const 
v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - v128_t vacc89AB = wasm_v128_load(b + 8); - v128_t vaccCDEF = wasm_v128_load(b + 12); - v128_t vaccGHIJ = wasm_v128_load(b + 16); - v128_t vaccKLMN = wasm_v128_load(b + 20); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - b += 24; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - 
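// The `doz(c, 8)` update below is XNNPACK's difference-or-zero helper: it
// subtracts 8 from the remaining channel count but saturates at zero, so a
// tail of 1-7 channels still runs exactly one final 8-wide iteration (the
// overread past the tail is what the kernel's XNN_OOB_READS annotation
// licenses; the stores stay in bounds because the buffer is rounded up to a
// multiple of 8). A sketch, assuming the usual definition in "xnnpack/math.h":
#include <stddef.h>

static inline size_t doz(size_t a, size_t b) {
  return a >= b ? a - b : 0;
}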
vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 24; channels -= 24) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - i0 += 24; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF 
= wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - v128_t vacc89AB = wasm_v128_load(buffer + 8); - v128_t vaccCDEF = wasm_v128_load(buffer + 12); - v128_t vaccGHIJ = wasm_v128_load(buffer + 16); - v128_t vaccKLMN = wasm_v128_load(buffer + 20); - buffer += 24; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - 
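// The three steps above are the magic-bias float->int trick: adding
// 0x1.8p23f (vmagic_bias) to a float of small enough magnitude leaves the
// value, rounded to nearest, in the low mantissa bits; the integer max()
// with vmagic_min applies the lower output bound while the value is still
// biased; and the final integer subtract strips the bias and folds in the
// output zero point in one operation. A scalar sketch, assuming IEEE-754
// binary32 floats (the helper name is illustrative):
#include <stdint.h>
#include <string.h>

static inline int32_t magic_bias_to_int(float x, int32_t output_zero_point) {
  const float biased = x + 0x1.8p23f;    // rounds x to nearest; needs |x| < 2^22
  int32_t bits;
  memcpy(&bits, &biased, sizeof(bits));  // reinterpret the bits, no conversion
  // 0x4B400000 is the bit pattern of 0x1.8p23f.
  return bits - ((int32_t) 0x4B400000 - output_zero_point);
}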
- v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - - v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNGHIJKLMN = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNGHIJKLMN = wasm_u8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store64_lane(output + 16, voutGHIJKLMNGHIJKLMN, 0); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c deleted file mode 100644 index 7302c3fea06..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c32.c +++ /dev/null @@ -1,556 
+0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t 
vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - const v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - const v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - const v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - const v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - const v128_t vaccOPQR = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV)); - const v128_t vaccSTUV = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - wasm_v128_store(b + 24, vaccOPQR); - wasm_v128_store(b + 28, vaccSTUV); - b += 32; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); 
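// ---- Editorial note (not part of the original file): this loop is the middle
// phase of the 7p7x multipass scheme. The first seven rows seeded `buffer` with
// init_bias plus their per-channel sums; each iteration here folds seven more
// rows into that int32 buffer; the trailing (at most seven) rows are handled
// after the loop, where scaling and requantization finally happen.
// input_increment rewinds each row pointer from the end of one 7-row block to
// the start of the next, accounting for the padded channel width.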
- i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, 
vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - v128_t vacc89AB = wasm_v128_load(b + 8); - v128_t vaccCDEF = wasm_v128_load(b + 12); - v128_t vaccGHIJ = wasm_v128_load(b + 16); - v128_t vaccKLMN = wasm_v128_load(b + 20); - v128_t vaccOPQR = wasm_v128_load(b + 24); - v128_t vaccSTUV = wasm_v128_load(b + 28); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV)); - vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - wasm_v128_store(b + 8, vacc89AB); - wasm_v128_store(b + 12, vaccCDEF); - wasm_v128_store(b + 16, vaccGHIJ); - wasm_v128_store(b + 20, vaccKLMN); - wasm_v128_store(b + 24, vaccOPQR); - wasm_v128_store(b + 28, vaccSTUV); - b += 32; - } - if XNN_UNLIKELY(c != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t 
vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 32; channels -= 32) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - 
vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - v128_t vacc89AB = wasm_v128_load(buffer + 8); - v128_t vaccCDEF = wasm_v128_load(buffer + 12); - v128_t vaccGHIJ = wasm_v128_load(buffer + 16); - v128_t vaccKLMN = wasm_v128_load(buffer + 20); - v128_t vaccOPQR = wasm_v128_load(buffer + 24); - v128_t vaccSTUV = wasm_v128_load(buffer + 28); - buffer += 32; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - vacc89AB = wasm_i32x4_add(vacc89AB, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV)); - vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN); - vaccOPQR = wasm_f32x4_convert_i32x4(vaccOPQR); - vaccSTUV = wasm_f32x4_convert_i32x4(vaccSTUV); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - vaccOPQR = wasm_f32x4_mul(vaccOPQR, vscale); - vaccSTUV = wasm_f32x4_mul(vaccSTUV, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - vaccOPQR = wasm_f32x4_add(vaccOPQR, vmagic_bias); - vaccSTUV = wasm_f32x4_add(vaccSTUV, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - vaccOPQR = wasm_i32x4_max(vaccOPQR, vmagic_min); - vaccSTUV = wasm_i32x4_max(vaccSTUV, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = 
wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - v128_t voutOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV); - - v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); - - vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = wasm_u8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); - output += 32; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c deleted file mode 100644 index 927a35bdad2..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-fp32-wasmsimd-c8.c +++ /dev/null @@ -1,278 +0,0 @@ -// Auto-generated file. Do not edit! 
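// ---- Editorial note (not part of the original file): the c8 kernel below
// walks its channel loops with `c = doz(c, 8)`, where doz is XNNPACK's
// "difference or zero", i.e. c > 8 ? c - 8 : 0. A trailing partial group of
// channels is therefore processed with a full 8-wide load; the deliberate
// overread is declared via XNN_OOB_READS, and input_increment is computed from
// round_up_po2(channels, 8) so every row pointer rewinds by the same padded
// amount.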
-// Template: src/qs8-gavgpool/multipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - const v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - const v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - 
const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(b); - v128_t vacc4567 = wasm_v128_load(b + 4); - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - wasm_v128_store(b, vacc0123); - wasm_v128_store(b + 4, vacc4567); - b += 8; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 8; channels -= 8) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, 
vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_v128_load(buffer); - v128_t vacc4567 = wasm_v128_load(buffer + 4); - buffer += 8; - - vacc0123 = wasm_i32x4_add(vacc0123, wasm_u32x4_extend_low_u16x8(vacc01234567)); - vacc4567 = wasm_i32x4_add(vacc4567, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c deleted file mode 100644 index a0e84af523d..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c16.c +++ /dev/null @@ -1,311 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
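// ---- Editorial note (not part of the original file): the kernel below uses
// the "rndnu" (rounding, nearest, up) requantization path instead of the fp32
// one. A scalar sketch of the three NEON steps that appear further down, with
// sat_shl and rounding_shr as hypothetical helpers for the saturating and
// rounding shifts, assuming the shifts and multiplier come verbatim from
// params->rndnu_neon:
//
//   acc = sat_shl(acc, left_pre_shift);                        // vqshlq_s32
//   acc = (int32_t) (((int64_t) acc * multiplier * 2) >> 32);  // vqdmulhq_s32 (saturating)
//   acc = rounding_shr(acc, -left_post_shift);                 // vrshlq_s32 with a negative shift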
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const 
uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 16)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - const uint8x16_t voutput_min = 
vld1q_dup_u8(&params->rndnu_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); - for (; channels >= 16; channels -= 16) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - #else // 
!XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c deleted file mode 100644 index baa5f8a3a86..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c24.c +++ /dev/null @@ -1,432 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
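// ---- Editorial note (not part of the original file): the c24 variant below
// differs from the c16 kernel above only in register blocking: its main loops
// carry three 8-lane sum groups instead of two, while the remainder path still
// works 8 channels at a time. Which block width got used was presumably picked
// by the (also deleted) gavgpool config tables to match common channel counts;
// that selection logic is not part of this hunk.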
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), 
vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 24; c -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); 
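// ---- Editorial note (not part of the original file): the u8 rows are summed
// in u16 lanes (vaddl_u8, then vaddw_u8), which is safe because at most seven
// rows are folded per pass: 7 * 255 = 1785 fits easily in 16 bits. Only each
// pass's total is widened and added into the int32 buffer.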
- const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + 
input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); - for (; channels >= 24; channels -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN,
vi6xGHIJKLMN); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift); - vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier); - vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift); - vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min)); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max)); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1_u8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); 
i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c deleted file mode 100644 index f5487ca6834..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c32.c +++ /dev/null @@ -1,494 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
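-// -// Kernel overview: multipass ("7p7x") variant tiled at 32 channels. The first pass widens 7 rows of uint8 inputs to uint16 partial sums per channel and stores init_bias plus the widened sums into the int32 scratch buffer; each middle pass accumulates 7 more rows into that buffer; the final pass handles the last 1-7 rows and requantizes with the rndnu scheme: saturating left pre-shift, saturating doubling multiply-high, rounding post-shift, zero-point addition, then saturating narrowing to uint8 with min/max clamping.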
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 +=
8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - const int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - const int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - const int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - const int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - const int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV))); - const int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c >= 32; c -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = 
vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(b + 8); - int32x4_t vaccCDEF = vld1q_s32(b + 12); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(b + 16); - int32x4_t vaccKLMN = vld1q_s32(b + 20); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(b + 24); - int32x4_t vaccSTUV = vld1q_s32(b + 28); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), 
vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV))); - vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - vst1q_s32(b, vacc89AB); b += 4; - vst1q_s32(b, vaccCDEF); b += 4; - vst1q_s32(b, vaccGHIJ); b += 4; - vst1q_s32(b, vaccKLMN); b += 4; - vst1q_s32(b, vaccOPQR); b += 4; - vst1q_s32(b, vaccSTUV); b += 4; - } - if XNN_UNLIKELY(c != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - - c = doz(c, 8); - } while (c != 0); - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->rndnu_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->rndnu_neon.output_max); - for (; channels >= 32; channels -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN =
vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - int32x4_t vacc89AB = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccCDEF = vld1q_s32(buffer); buffer += 4; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - int32x4_t vaccGHIJ = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccKLMN = vld1q_s32(buffer); buffer += 4; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - int32x4_t vaccOPQR = vld1q_s32(buffer); buffer += 4; - int32x4_t vaccSTUV = vld1q_s32(buffer); buffer += 4; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc89AB), vget_low_u16(vsum89ABCDEF))); - vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccCDEF), vget_high_u16(vsum89ABCDEF))); - vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccGHIJ), vget_low_u16(vsumGHIJKLMN))); - vaccKLMN = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccKLMN), vget_high_u16(vsumGHIJKLMN))); - vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccOPQR), vget_low_u16(vsumOPQRSTUV))); - vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vaccSTUV), vget_high_u16(vsumOPQRSTUV))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift); - vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift); - vaccOPQR = vqshlq_s32(vaccOPQR, vleft_pre_shift); - vaccSTUV = vqshlq_s32(vaccSTUV, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier); - vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier); - vaccOPQR = vqdmulhq_s32(vaccOPQR, vmultiplier); - vaccSTUV = vqdmulhq_s32(vaccSTUV, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift); - vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift); - vaccOPQR = vrshlq_s32(vaccOPQR, vleft_post_shift); - vaccSTUV = vrshlq_s32(vaccSTUV, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); 
i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c deleted file mode 100644 index 72a98a0a0a6..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7p7x-minmax-rndnu-neon-c8.c +++ /dev/null @@ -1,243 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/multipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
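-// -// Kernel overview: same multipass structure as the wider tiles, fixed at 8 channels per step; the channel loop advances with doz(c, 8) (difference-or-zero), so a partially filled trailing vector relies on the XNN_OOB_READS padding guarantee instead of a separate remainder path.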
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows > 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t); - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias); - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - const int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - const int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - - for (rows -= 7; rows > 7; rows -= 7) { - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - - int32_t* b = buffer; - size_t c = channels; - for (; c != 0; c = doz(c, 8)) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(b); - int32x4_t vacc4567 = vld1q_s32(b + 4); - vsum01234567 = vaddw_u8(vsum01234567,
vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vst1q_s32(b, vacc0123); b += 4; - vst1q_s32(b, vacc4567); b += 4; - } - } - - i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment); - i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vleft_pre_shift = vld1q_dup_s32(&params->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(&params->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(&params->rndnu_neon.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(&params->rndnu_neon.output_max); - for (; channels >= 8; channels -= 8) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_u8(vout01234567, voutput_min); - - vout01234567 = vmin_u8(vout01234567, voutput_max); - -
vst1_u8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const uint8x8_t vi0x01234567 = vld1_u8(i0); - const uint8x8_t vi1x01234567 = vld1_u8(i1); - const uint8x8_t vi2x01234567 = vld1_u8(i2); - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - int32x4_t vacc0123 = vld1q_s32(buffer); buffer += 4; - int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4; - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc0123), vget_low_u16(vsum01234567))); - vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vacc4567), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, voutput_min); - vout01234567 = vmin_u8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c deleted file mode 100644 index 5645bb93d58..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c16.c +++ /dev/null @@ -1,199 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
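-// -// Kernel overview: unipass ("7x") variant for at most 7 rows, tiled at 16 channels. Pointers past the last valid row are redirected to the zero buffer, the 7 widened row sums are added to init_bias, and requantization uses the fp32 magic-bias path: scale in float, add magic_bias, reinterpret as int32, then subtract magic_bias_less_output_zero_point before saturating narrowing to uint8.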
- -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max); - for (; channels >= 16; channels -= 16) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF =
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), 
vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c deleted file mode 100644 index 7672258d1b4..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c24.c +++ /dev/null @@ -1,229 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <arm_neon.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias); - const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max); - for (; channels >= 24; channels -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; -
uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - - 
vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min)); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max)); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1_u8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, 
vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c deleted file mode 100644 index cf3a8d7c72d..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c32.c +++ /dev/null @@ -1,254 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neon.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale); - const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias); - const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->fp32_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->fp32_neon.output_max); - for (; channels >= 32; channels -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = 
vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV))); - int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t 
vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - vacc89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc89AB, vmagic_bias)); - vaccCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpaccCDEF, vmagic_bias)); - vaccGHIJ = vreinterpretq_s32_f32(vaddq_f32(vfpaccGHIJ, vmagic_bias)); - vaccKLMN = vreinterpretq_s32_f32(vaddq_f32(vfpaccKLMN, vmagic_bias)); - vaccOPQR = vreinterpretq_s32_f32(vaddq_f32(vfpaccOPQR, vmagic_bias)); - vaccSTUV = vreinterpretq_s32_f32(vaddq_f32(vfpaccSTUV, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = vqsubq_s32(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = vqsubq_s32(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = vqsubq_s32(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = vqsubq_s32(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = vqsubq_s32(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = vqsubq_s32(vaccSTUV, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t 
vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c deleted file mode 100644 index 3a30000e2a8..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neon-c8.c +++ /dev/null @@ -1,168 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
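// Note on the summation pattern in the deleted unipass kernels below (a
// minimal scalar model for orientation; the helper name and array-of-rows
// parameter are illustrative, not part of the patch). Callers point any row
// past `rows` at the shared `zero` buffer, so all 7 pointers can be summed
// unconditionally, and since 7 * 255 = 1785 fits in 16 bits the kernels can
// accumulate with vaddl_u8/vaddw_u8 before widening to 32 bits with the
// init_bias term:
#include <stddef.h>
#include <stdint.h>

static inline int32_t qu8_gavgpool_row7_sum(const uint8_t* rows[7], size_t c,
                                            int32_t init_bias) {
  uint16_t sum = 0;  // 16-bit accumulator; 7 rows of uint8_t cannot overflow it
  for (size_t k = 0; k < 7; k++) {
    sum = (uint16_t) (sum + rows[k][c]);
  }
  return init_bias + (int32_t) sum;  // mirrors the final widening add with the bias
}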
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8(
-    size_t rows,
-    size_t channels,
-    const uint8_t* input,
-    size_t input_stride,
-    const uint8_t* zero,
-    uint8_t* output,
-    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const uint8_t* i0 = input;
-  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
-  const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
-  const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
-  const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
-  const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
-  const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);
-  for (; channels >= 8; channels -= 8) {
-    const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8;
-    const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8;
-
-    const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8;
-    uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567);
-
-    const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567);
-    const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567);
-    const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567);
-    const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8;
-    vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567);
-    vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567);
-
-    int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567)));
-    int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567)));
-
-    float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123);
-    float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567);
-
-    vfpacc0123 = vmulq_f32(vfpacc0123, vscale);
-    vfpacc4567 = vmulq_f32(vfpacc4567, vscale);
-
-    vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias));
-    vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias));
-
-    vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point);
-    vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point);
-
-    #if XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567);
-    #else  // !XNN_ARCH_ARM64
-      int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
-    #endif  // !XNN_ARCH_ARM64
-
-
-    #if XNN_ARCH_ARM64
-      uint8x8_t vout01234567 = 
vqmovun_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_u8(vout01234567, voutput_min); - - vout01234567 = vmin_u8(vout01234567, voutput_max); - - vst1_u8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0123, vmagic_bias)); - vacc4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc4567, vmagic_bias)); - - vacc0123 = vqsubq_s32(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = vqsubq_s32(vacc4567, vmagic_bias_less_output_zero_point); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, voutput_min); - vout01234567 = vmin_u8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c deleted file mode 100644 index 1af7cd051ef..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c16.c +++ /dev/null @@ -1,194 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
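// The neonv8 kernels that follow differ from the plain NEON ones above only
// in the rounding step: instead of the magic-bias bit trick they use
// vcvtnq_s32_f32 (ARMv8 FCVTNS: round to nearest, ties to even) and then add
// the output zero point with a saturating int16 add. A distilled sketch of
// that tail (illustrative helper name; the kernels' explicit output_min and
// output_max clamp is omitted here):
#include <arm_neon.h>

static inline uint8x8_t qu8_requantize_neonv8_sketch(int32x4_t vacc_lo, int32x4_t vacc_hi,
                                                     float32x4_t vscale,
                                                     int16x8_t voutput_zero_point) {
  const int32x4_t vlo = vcvtnq_s32_f32(vmulq_f32(vcvtq_f32_s32(vacc_lo), vscale));
  const int32x4_t vhi = vcvtnq_s32_f32(vmulq_f32(vcvtq_f32_s32(vacc_hi), vscale));
  int16x8_t vacc = vcombine_s16(vqmovn_s32(vlo), vqmovn_s32(vhi));
  vacc = vqaddq_s16(vacc, voutput_zero_point);  // saturating zero-point add
  return vqmovun_s16(vacc);  // saturating narrow clamps to [0, 255]
}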
- -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->fp32_neonv8.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->fp32_neonv8.output_max); - for (; channels >= 16; channels -= 16) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - 
vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c deleted file mode 100644 index 7dbb6092192..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c24.c +++ /dev/null @@ -1,223 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->fp32_neonv8.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->fp32_neonv8.output_max); - for (; channels >= 24; channels -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - 
vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, 
voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min)); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max)); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1_u8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c deleted file mode 100644 index 57ad205c466..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c32.c +++ /dev/null @@ -1,247 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->fp32_neonv8.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->fp32_neonv8.output_max); - for (; channels >= 32; channels -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - 
vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV))); - int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - float32x4_t vfpacc89AB = vcvtq_f32_s32(vacc89AB); - float32x4_t vfpaccCDEF = vcvtq_f32_s32(vaccCDEF); - float32x4_t vfpaccGHIJ = vcvtq_f32_s32(vaccGHIJ); - float32x4_t vfpaccKLMN = vcvtq_f32_s32(vaccKLMN); - float32x4_t vfpaccOPQR = vcvtq_f32_s32(vaccOPQR); - float32x4_t vfpaccSTUV = vcvtq_f32_s32(vaccSTUV); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - vfpacc89AB = vmulq_f32(vfpacc89AB, vscale); - vfpaccCDEF = vmulq_f32(vfpaccCDEF, vscale); - vfpaccGHIJ = vmulq_f32(vfpaccGHIJ, vscale); - vfpaccKLMN = vmulq_f32(vfpaccKLMN, vscale); - vfpaccOPQR = vmulq_f32(vfpaccOPQR, vscale); - vfpaccSTUV = vmulq_f32(vfpaccSTUV, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - vacc89AB = vcvtnq_s32_f32(vfpacc89AB); - vaccCDEF = vcvtnq_s32_f32(vfpaccCDEF); - vaccGHIJ = vcvtnq_s32_f32(vfpaccGHIJ); - vaccKLMN = vcvtnq_s32_f32(vfpaccKLMN); - vaccOPQR = vcvtnq_s32_f32(vfpaccOPQR); - vaccSTUV = vcvtnq_s32_f32(vfpaccSTUV); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = 
vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) 
output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c deleted file mode 100644 index a6a12468cdf..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-neonv8-c8.c +++ /dev/null @@ -1,164 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/intrinsics-polyfill.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->fp32_neonv8.init_bias); - const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(¶ms->fp32_neonv8.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(¶ms->fp32_neonv8.output_max); - for (; channels >= 8; channels -= 8) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_u8(vout01234567, voutput_min); - - vout01234567 = vmin_u8(vout01234567, voutput_max); - - vst1_u8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - float32x4_t vfpacc0123 = vcvtq_f32_s32(vacc0123); - float32x4_t vfpacc4567 = vcvtq_f32_s32(vacc4567); - - vfpacc0123 = vmulq_f32(vfpacc0123, vscale); - vfpacc4567 = vmulq_f32(vfpacc4567, vscale); - - vacc0123 = vcvtnq_s32_f32(vfpacc0123); - vacc4567 = vcvtnq_s32_f32(vfpacc4567); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, voutput_min); - vout01234567 = vmin_u8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c deleted file mode 100644 index 8ca8f58c8ed..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c1.c +++ /dev/null @@ -1,88 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/qs8-gavgpool/unipass-scalar.c.in
-// Generator: tools/xngen
-//
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/math.h"
-
-
-void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1(
-    size_t rows,
-    size_t channels,
-    const uint8_t* input,
-    size_t input_stride,
-    const uint8_t* zero,
-    uint8_t* output,
-    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const uint8_t* i0 = input;
-  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
-  const float vscale = params->fp32_scalar_fmagic.scale;
-  const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
-  const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
-  const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
-  const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
-  do {
-    int32_t vacc = vinit_bias;
-    const int32_t vi0 = (int32_t) *i0++;
-    const int32_t vi1 = (int32_t) *i1++;
-
-    vacc += vi0;
-    const int32_t vi2 = (int32_t) *i2++;
-    vacc += vi1;
-    const int32_t vi3 = (int32_t) *i3++;
-    vacc += vi2;
-    const int32_t vi4 = (int32_t) *i4++;
-    vacc += vi3;
-    const int32_t vi5 = (int32_t) *i5++;
-    vacc += vi4;
-    const int32_t vi6 = (int32_t) *i6++;
-
-    vacc += vi5;
-    vacc += vi6;
-
-    float vfpacc = (float) vacc * vscale;
-    vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
-    vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
-    vfpacc += vmagic_bias;
-    int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
-
-    *output++ = (uint8_t) vout;
-  } while (--channels != 0);
-}
diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c
deleted file mode 100644
index b56cc583558..00000000000
--- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c2.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gavgpool/unipass-scalar.c.in
-// Generator: tools/xngen
-//
-// Copyright 2021 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
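The scalar fmagic kernels removed here all funnel into one rounding idiom: clamp the scaled accumulator in float space, add a large "magic" bias (1.5 * 2**23) so the value to be rounded lands in the low mantissa bits, then reinterpret the float's bit pattern as an integer and subtract the bias less the output zero point. A minimal standalone sketch of that idiom, assuming IEEE-754 binary32 floats and an illustrative zero point of 128 (the deleted kernels read precomputed constants from the params struct, not the literals used here):

#include <stdint.h>
#include <string.h>

// Quantize one scaled accumulator to uint8 the way the fmagic kernels do.
// The clamp bounds and zero point below are illustrative assumptions.
static inline uint8_t quantize_fmagic(float vfpacc) {
  const float vmagic_bias = 12582912.0f;  // 1.5 * 2**23, bit pattern 0x4B400000
  const int32_t vmagic_bias_less_output_zero_point = INT32_C(0x4B400000) - 128;
  vfpacc = vfpacc < -128.0f ? -128.0f : vfpacc;  // output_min - zero_point
  vfpacc = vfpacc > 127.0f ? 127.0f : vfpacc;    // output_max - zero_point
  vfpacc += vmagic_bias;                         // rounds to nearest-even here
  int32_t vbits;
  memcpy(&vbits, &vfpacc, sizeof(vbits));        // float_as_uint32()
  return (uint8_t) (vbits - vmagic_bias_less_output_zero_point);
}

Adding the magic bias performs the rounding during the float addition itself, so no separate rounding instruction is needed; the clamp must come first because the trick is only exact while the biased value stays within [2**23, 2**24).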
- -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 2; channels -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - 
vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output = (uint8_t) vout; - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c deleted file mode 100644 index 4f7de2e738e..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-fmagic-c4.c +++ /dev/null @@ -1,189 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias; - const float vscale = params->fp32_scalar_fmagic.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point; - const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias; - const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point; - for (; channels >= 4; channels -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t 
vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point; - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point; - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point; - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output[2] = (uint8_t) vout2; - output[3] = (uint8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c deleted file mode 100644 index 91a66c1bc0c..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c1.c +++ 
/dev/null @@ -1,89 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c deleted file mode 100644 index f8a36094718..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c2.c +++ /dev/null @@ -1,151 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
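The imagic variant used by these kernels differs from fmagic only in where the clamp happens: the magic bias is added first, and the min/max clamp is applied to the reinterpreted integer instead of the float, which lets the output bounds be fused into the bias constants. A sketch under the same illustrative assumptions (zero point 128, output range [0, 255]); the real magic_min, magic_max, and magic_bias_less_zero_point arrive precomputed in the params struct:

#include <stdint.h>
#include <string.h>

// Quantize one scaled accumulator to uint8 the way the imagic kernels do.
// Clamping on the integer side fuses the output bounds with the bias trick.
static inline uint8_t quantize_imagic(float vfpacc) {
  const float vmagic_bias = 12582912.0f;  // 1.5 * 2**23
  // Bit patterns of (magic_bias + (bound - zero_point)); illustrative values.
  const int32_t vmagic_min = INT32_C(0x4B400000) + (0 - 128);
  const int32_t vmagic_max = INT32_C(0x4B400000) + (255 - 128);
  const int32_t vmagic_bias_less_zero_point = INT32_C(0x4B400000) - 128;
  vfpacc += vmagic_bias;
  int32_t vout;
  memcpy(&vout, &vfpacc, sizeof(vout));          // float_as_uint32()
  vout = vout < vmagic_min ? vmagic_min : vout;  // math_max_s32()
  vout = vout > vmagic_max ? vmagic_max : vout;  // math_min_s32()
  return (uint8_t) (vout - vmagic_bias_less_zero_point);
}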
- -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - for (; channels >= 2; channels -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0); - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1); - - vout0 = math_max_s32(vout0, vmagic_min); - vout1 = math_max_s32(vout1, vmagic_min); - - vout0 = math_min_s32(vout0, vmagic_max); - vout1 = math_min_s32(vout1, vmagic_max); - - vout0 -= vmagic_bias_less_zero_point; - vout1 -= vmagic_bias_less_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5; - 
vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output = (uint8_t) vout; - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c deleted file mode 100644 index 1ba9a70afac..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-imagic-c4.c +++ /dev/null @@ -1,195 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias; - const float vscale = params->fp32_scalar_imagic.scale; - const float vmagic_bias = params->fp32_scalar_imagic.magic_bias; - const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min; - const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max; - const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point; - for (; channels >= 4; channels -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 += vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - 
vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 += vmagic_bias; - vfpacc1 += vmagic_bias; - vfpacc2 += vmagic_bias; - vfpacc3 += vmagic_bias; - - int32_t vout0 = (int32_t) float_as_uint32(vfpacc0); - int32_t vout1 = (int32_t) float_as_uint32(vfpacc1); - int32_t vout2 = (int32_t) float_as_uint32(vfpacc2); - int32_t vout3 = (int32_t) float_as_uint32(vfpacc3); - - vout0 = math_max_s32(vout0, vmagic_min); - vout1 = math_max_s32(vout1, vmagic_min); - vout2 = math_max_s32(vout2, vmagic_min); - vout3 = math_max_s32(vout3, vmagic_min); - - vout0 = math_min_s32(vout0, vmagic_max); - vout1 = math_min_s32(vout1, vmagic_max); - vout2 = math_min_s32(vout2, vmagic_max); - vout3 = math_min_s32(vout3, vmagic_max); - - vout0 -= vmagic_bias_less_zero_point; - vout1 -= vmagic_bias_less_zero_point; - vout2 -= vmagic_bias_less_zero_point; - vout3 -= vmagic_bias_less_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output[2] = (uint8_t) vout2; - output[3] = (uint8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc += vmagic_bias; - int32_t vout = (int32_t) float_as_uint32(vfpacc); - vout = math_max_s32(vout, vmagic_min); - vout = math_min_s32(vout, vmagic_max); - vout -= vmagic_bias_less_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c deleted file mode 100644 index dd266b0e24d..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c1.c +++ /dev/null @@ -1,88 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c deleted file mode 100644 index a18e29bda4c..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c2.c +++ /dev/null @@ -1,147 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
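The lrintf kernels take the portable route: clamp in float space against the output bounds expressed relative to the zero point, round with lrintf() (round-to-nearest-even under the default rounding mode), then re-add the zero point as an integer. A sketch, again assuming a zero point of 128 rather than the value the params struct actually carries:

#include <math.h>
#include <stdint.h>

// Quantize one scaled accumulator to uint8 the way the lrintf kernels do.
static inline uint8_t quantize_lrintf(float vfpacc) {
  const int32_t voutput_zero_point = 128;  // illustrative assumption
  vfpacc = fmaxf(vfpacc, -128.0f);  // output_min - zero_point
  vfpacc = fminf(vfpacc, 127.0f);   // output_max - zero_point
  const int32_t vrndacc = (int32_t) lrintf(vfpacc);
  return (uint8_t) (vrndacc + voutput_zero_point);
}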
- -#include -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - for (; channels >= 2; channels -= 2) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - i0 += 2; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - i1 += 2; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - i2 += 2; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - i3 += 2; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - i4 += 2; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - i5 += 2; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - i6 += 2; - - vacc0 += vi6x0; - vacc1 += vi6x1; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - - const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0); - const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1); - - int32_t vout0 = vrndacc0 + voutput_zero_point; - int32_t vout1 = vrndacc1 + voutput_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output += 2; - } - if XNN_UNLIKELY(channels != 0) { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0; - const int32_t vi1 = (int32_t) *i1; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4; - vacc += vi3; - 
const int32_t vi5 = (int32_t) *i5; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output = (uint8_t) vout; - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c deleted file mode 100644 index 2f24c64e66b..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-scalar-lrintf-c4.c +++ /dev/null @@ -1,189 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-scalar.c.in -// Generator: tools/xngen -// -// Copyright 2021 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/math.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias; - const float vscale = params->fp32_scalar_lrintf.scale; - const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point; - const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point; - const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point; - for (; channels >= 4; channels -= 4) { - const int32_t vi0x0 = (int32_t) i0[0]; - const int32_t vi0x1 = (int32_t) i0[1]; - const int32_t vi0x2 = (int32_t) i0[2]; - const int32_t vi0x3 = (int32_t) i0[3]; - i0 += 4; - - int32_t vacc0 = vi0x0 + vinit_bias; - const int32_t vi1x0 = (int32_t) i1[0]; - int32_t vacc1 = vi0x1 + vinit_bias; - const int32_t vi1x1 = (int32_t) i1[1]; - int32_t vacc2 = vi0x2 + vinit_bias; - const int32_t vi1x2 = (int32_t) i1[2]; - int32_t vacc3 = vi0x3 + vinit_bias; - const int32_t vi1x3 = (int32_t) i1[3]; - i1 += 4; - - vacc0 += vi1x0; - const int32_t vi2x0 = (int32_t) i2[0]; - vacc1 += vi1x1; - const int32_t vi2x1 = (int32_t) i2[1]; - vacc2 += vi1x2; - const int32_t vi2x2 = (int32_t) i2[2]; - vacc3 += vi1x3; - const int32_t vi2x3 = (int32_t) i2[3]; - i2 += 4; - vacc0 += vi2x0; - const int32_t vi3x0 = (int32_t) i3[0]; - vacc1 += vi2x1; - const int32_t vi3x1 = (int32_t) i3[1]; - vacc2 
+= vi2x2; - const int32_t vi3x2 = (int32_t) i3[2]; - vacc3 += vi2x3; - const int32_t vi3x3 = (int32_t) i3[3]; - i3 += 4; - vacc0 += vi3x0; - const int32_t vi4x0 = (int32_t) i4[0]; - vacc1 += vi3x1; - const int32_t vi4x1 = (int32_t) i4[1]; - vacc2 += vi3x2; - const int32_t vi4x2 = (int32_t) i4[2]; - vacc3 += vi3x3; - const int32_t vi4x3 = (int32_t) i4[3]; - i4 += 4; - vacc0 += vi4x0; - const int32_t vi5x0 = (int32_t) i5[0]; - vacc1 += vi4x1; - const int32_t vi5x1 = (int32_t) i5[1]; - vacc2 += vi4x2; - const int32_t vi5x2 = (int32_t) i5[2]; - vacc3 += vi4x3; - const int32_t vi5x3 = (int32_t) i5[3]; - i5 += 4; - vacc0 += vi5x0; - const int32_t vi6x0 = (int32_t) i6[0]; - vacc1 += vi5x1; - const int32_t vi6x1 = (int32_t) i6[1]; - vacc2 += vi5x2; - const int32_t vi6x2 = (int32_t) i6[2]; - vacc3 += vi5x3; - const int32_t vi6x3 = (int32_t) i6[3]; - i6 += 4; - - vacc0 += vi6x0; - vacc1 += vi6x1; - vacc2 += vi6x2; - vacc3 += vi6x3; - - float vfpacc0 = (float) vacc0 * vscale; - float vfpacc1 = (float) vacc1 * vscale; - float vfpacc2 = (float) vacc2 * vscale; - float vfpacc3 = (float) vacc3 * vscale; - - vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point); - vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point); - vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point); - vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point); - - vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point); - vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point); - vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point); - vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point); - - const int32_t vrndacc0 = (int32_t) lrintf(vfpacc0); - const int32_t vrndacc1 = (int32_t) lrintf(vfpacc1); - const int32_t vrndacc2 = (int32_t) lrintf(vfpacc2); - const int32_t vrndacc3 = (int32_t) lrintf(vfpacc3); - - int32_t vout0 = vrndacc0 + voutput_zero_point; - int32_t vout1 = vrndacc1 + voutput_zero_point; - int32_t vout2 = vrndacc2 + voutput_zero_point; - int32_t vout3 = vrndacc3 + voutput_zero_point; - - output[0] = (uint8_t) vout0; - output[1] = (uint8_t) vout1; - output[2] = (uint8_t) vout2; - output[3] = (uint8_t) vout3; - output += 4; - } - if XNN_UNLIKELY(channels != 0) { - do { - int32_t vacc = vinit_bias; - const int32_t vi0 = (int32_t) *i0++; - const int32_t vi1 = (int32_t) *i1++; - - vacc += vi0; - const int32_t vi2 = (int32_t) *i2++; - vacc += vi1; - const int32_t vi3 = (int32_t) *i3++; - vacc += vi2; - const int32_t vi4 = (int32_t) *i4++; - vacc += vi3; - const int32_t vi5 = (int32_t) *i5++; - vacc += vi4; - const int32_t vi6 = (int32_t) *i6++; - - vacc += vi5; - vacc += vi6; - - float vfpacc = (float) vacc * vscale; - vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point); - vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point); - const int32_t vrndacc = (int32_t) lrintf(vfpacc); - int32_t vout = vrndacc + voutput_zero_point; - - *output++ = (uint8_t) vout; - } while (--channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c deleted file mode 100644 index b422e8f3fb1..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c16.c +++ /dev/null @@ -1,249 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
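The SSE2 kernels below work around an ISA gap: SSE2 has no unsigned widening instruction (pmovzx arrives only with SSE4.1), so each 8-byte load is zero-extended to 16-bit lanes by interleaving with a zero register; the 16-bit row sums are later widened to 32 bits the same way with _mm_unpacklo_epi16/_mm_unpackhi_epi16 before the bias is added. A sketch of the byte-level idiom:

#include <emmintrin.h>
#include <stdint.h>

// Widen 8 uint8 values to 8 uint16 lanes using SSE2 only: interleaving the
// low bytes with zeros is equivalent to zero extension.
static inline __m128i widen_u8x8_to_u16x8(const uint8_t* p) {
  const __m128i vzero = _mm_setzero_si128();
  const __m128i vi = _mm_loadl_epi64((const __m128i*) p);  // 8 bytes, low half
  return _mm_unpacklo_epi8(vi, vzero);
}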
- -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - const __m128i vzero = _mm_setzero_si128(); - for (; channels >= 16; channels -= 16) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - i0 += 16; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - i1 += 16; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - i2 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - i3 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - i4 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i 
vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - i5 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - 
vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (uint8_t) vout0123; - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c deleted file mode 100644 index 4bbffc98383..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c24.c +++ /dev/null @@ -1,285 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
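The output stage of these SSE2 kernels is worth a note: the upper clamp is folded into the float stage (the _mm_min_ps against output_max_less_zero_point above), and _mm_packus_epi16 already saturates at 255, so after packing only the lower bound needs an explicit _mm_max_epu8. A sketch of the stage, with the two params-derived vectors passed in as assumptions:

#include <emmintrin.h>

// Narrow two int32x4 accumulators to eight uint8 outputs as the SSE2 kernels
// do: signed-saturating pack to int16, saturating add of the zero point,
// unsigned-saturating pack to uint8, then clamp only the low side.
static inline __m128i pack_output_sse2(__m128i vacc0123, __m128i vacc4567,
                                       __m128i voutput_zero_point,
                                       __m128i voutput_min) {
  const __m128i vout01234567 =
      _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
  const __m128i vout = _mm_packus_epi16(vout01234567, vout01234567);
  return _mm_max_epu8(vout, voutput_min);
}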
- -#include - -#include - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - const __m128i vzero = _mm_setzero_si128(); - for (; channels >= 24; channels -= 24) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); - const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); - i0 += 24; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, vzero); - const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); - const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, vzero); - const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); - i1 += 24; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, vzero); - const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); - const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, vzero); - const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); - i2 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, vzero); - const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, vzero); - const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); - i3 += 
24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, vzero); - const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, vzero); - const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); - i4 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, vzero); - const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, vzero); - const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); - i5 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, vzero); - const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, vzero); - const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, vzero); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_unpacklo_epi16(vacc89ABCDEF, vzero); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_unpacklo_epi16(vaccGHIJKLMN, vzero); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, 
vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - 
vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (uint8_t) vout0123; - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c deleted file mode 100644 index bd91beee013..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse2-c8.c +++ /dev/null @@ -1,208 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse2.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <emmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min); - const __m128i vzero = _mm_setzero_si128(); - for (; channels >= 8; channels -= 8) { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i 
vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - - const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); - i0 += 8; - - const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); - i1 += 8; - - const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, vzero); - const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); - i2 += 8; - - const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, vzero); - const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); - i3 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, vzero); - const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); - i4 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, vzero); - const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); - i5 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, vzero); - const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); - i6 += 8; - - vacc01234567 = 
_mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, vzero); - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_unpacklo_epi16(vacc01234567, vzero); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - uint32_t vout0123 = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567); - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) vout0123); - vout0123 >>= 16; - output += 2; - } - if (channels & 1) { - *output = (uint8_t) vout0123; - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c deleted file mode 100644 index 4fe83aeca85..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c16.c +++ /dev/null @@ -1,213 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 16; channels -= 16) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - i0 += 16; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - i1 += 16; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - i2 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - i3 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - i4 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - i5 += 16; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) 
i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - i6 += 16; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, 
vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c deleted file mode 100644 index f212ae0ab70..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c24.c +++ /dev/null @@ -1,242 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 24; channels -= 24) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8))); - const __m128i vxi0xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16))); - i0 += 24; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8))); - const __m128i vxi1xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16))); - i1 += 24; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - __m128i vacc89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); - const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8))); - __m128i vaccGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const __m128i vxi2xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16))); - i2 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi2x89ABCDEF); - const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi2xGHIJKLMN); - const __m128i vxi3xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16))); - i3 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi3x89ABCDEF); - const __m128i vxi4x89ABCDEF = 
_mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi3xGHIJKLMN); - const __m128i vxi4xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16))); - i4 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi4x89ABCDEF); - const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi4xGHIJKLMN); - const __m128i vxi5xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16))); - i5 += 24; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi5x89ABCDEF); - const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8))); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi5xGHIJKLMN); - const __m128i vxi6xGHIJKLMN = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16))); - i6 += 24; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - vacc89ABCDEF = _mm_add_epi16(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = _mm_add_epi16(vaccGHIJKLMN, vxi6xGHIJKLMN); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - __m128i vacc89AB = _mm_cvtepu16_epi32(vacc89ABCDEF); - __m128i vaccCDEF = _mm_unpackhi_epi16(vacc89ABCDEF, vzero); - __m128i vaccGHIJ = _mm_cvtepu16_epi32(vaccGHIJKLMN); - __m128i vaccKLMN = _mm_unpackhi_epi16(vaccGHIJKLMN, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - vacc89AB = _mm_add_epi32(vacc89AB, vinit_bias); - vaccCDEF = _mm_add_epi32(vaccCDEF, vinit_bias); - vaccGHIJ = _mm_add_epi32(vaccGHIJ, vinit_bias); - vaccKLMN = _mm_add_epi32(vaccKLMN, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - __m128 vfpacc89AB = _mm_cvtepi32_ps(vacc89AB); - __m128 vfpaccCDEF = _mm_cvtepi32_ps(vaccCDEF); - __m128 vfpaccGHIJ = _mm_cvtepi32_ps(vaccGHIJ); - __m128 vfpaccKLMN = _mm_cvtepi32_ps(vaccKLMN); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale); - vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale); - vfpaccGHIJ = _mm_mul_ps(vfpaccGHIJ, vscale); - vfpaccKLMN = _mm_mul_ps(vfpaccKLMN, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - vfpacc89AB = _mm_min_ps(vfpacc89AB, voutput_max_less_zero_point); - vfpaccCDEF = _mm_min_ps(vfpaccCDEF, voutput_max_less_zero_point); - vfpaccGHIJ = _mm_min_ps(vfpaccGHIJ, voutput_max_less_zero_point); - vfpaccKLMN = _mm_min_ps(vfpaccKLMN, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - vacc89AB = _mm_cvtps_epi32(vfpacc89AB); - vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF); - vaccGHIJ = _mm_cvtps_epi32(vfpaccGHIJ); - vaccKLMN = _mm_cvtps_epi32(vfpaccKLMN); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); - __m128i voutGHIJKLMN = 
_mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); - - __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF); - __m128i voutGHIJKLMNGHIJKLMN = _mm_packus_epi16(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNGHIJKLMN = _mm_max_epu8(voutGHIJKLMNGHIJKLMN, voutput_min); - - _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); - _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if XNN_LIKELY(channels >= 8) { - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c deleted file mode 100644 index 5227662b435..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-sse41-c8.c +++ /dev/null @@ -1,179 +0,0 @@ -// 
Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-sse4.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <smmintrin.h> - -#include "xnnpack/gavgpool.h" -#include "xnnpack/unaligned.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias); - const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale); - const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point); - const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point); - const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min); - for (; channels >= 8; channels -= 8) { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - const __m128i vzero = _mm_setzero_si128(); - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, vzero); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - 
vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - _mm_storel_epi64((__m128i*) output, vout0123456701234567); - output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i0)); - i0 += 8; - const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i1)); - i1 += 8; - - __m128i vacc01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); - const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i2)); - i2 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi2x01234567); - const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i3)); - i3 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi3x01234567); - const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i4)); - i4 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi4x01234567); - const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i5)); - i5 += 8; - vacc01234567 = _mm_add_epi16(vacc01234567, vxi5x01234567); - const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) i6)); - i6 += 8; - - vacc01234567 = _mm_add_epi16(vacc01234567, vxi6x01234567); - - __m128i vacc0123 = _mm_cvtepu16_epi32(vacc01234567); - __m128i vacc4567 = _mm_unpackhi_epi16(vacc01234567, _mm_setzero_si128()); - - vacc0123 = _mm_add_epi32(vacc0123, vinit_bias); - vacc4567 = _mm_add_epi32(vacc4567, vinit_bias); - - __m128 vfpacc0123 = _mm_cvtepi32_ps(vacc0123); - __m128 vfpacc4567 = _mm_cvtepi32_ps(vacc4567); - - vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale); - vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale); - - vfpacc0123 = _mm_min_ps(vfpacc0123, voutput_max_less_zero_point); - vfpacc4567 = _mm_min_ps(vfpacc4567, voutput_max_less_zero_point); - - vacc0123 = _mm_cvtps_epi32(vfpacc0123); - vacc4567 = _mm_cvtps_epi32(vfpacc4567); - - __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); - - __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567); - vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min); - - if (channels & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567)); - vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0)); - vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0); - } - } - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c deleted file mode 100644 index 4789dbe1371..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c16.c +++ /dev/null @@ -1,211 +0,0 @@ -// Auto-generated file. Do not edit! 
-// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 16; channels -= 16) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - i0 += 16; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - i1 += 16; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - i2 += 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - i3 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - i4 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - i5 += 16; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - i6 
+= 16; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - - v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - - vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = 
wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c deleted file mode 100644 index 4d6d42a4baa..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c24.c +++ /dev/null @@ -1,240 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include <assert.h> - -#include <wasm_simd128.h> - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 24; channels -= 24) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - i0 += 24; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const 
v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - i1 += 24; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - i2 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - i3 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - i4 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - i5 += 24; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - i6 += 24; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = wasm_f32x4_convert_i32x4(vaccKLMN); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = 
wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - - v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNGHIJKLMN = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutGHIJKLMN); - - vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNGHIJKLMN = wasm_u8x16_min(voutGHIJKLMNGHIJKLMN, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store64_lane(output + 16, voutGHIJKLMNGHIJKLMN, 0); - output += 24; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - 
wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c deleted file mode 100644 index a884f164edf..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c32.c +++ /dev/null @@ -1,266 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); - const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); - const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); - const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); - const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); - const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); - for (; channels >= 32; channels -= 32) { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - const v128_t vxi0x89ABCDEF = wasm_u16x8_load8x8(i0 + 8); - const v128_t vxi0xGHIJKLMN = wasm_u16x8_load8x8(i0 + 16); - const v128_t vxi0xOPQRSTUV = wasm_u16x8_load8x8(i0 + 24); - i0 += 32; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - const v128_t vxi1x89ABCDEF = wasm_u16x8_load8x8(i1 + 8); - const v128_t vxi1xGHIJKLMN = wasm_u16x8_load8x8(i1 + 16); - const v128_t vxi1xOPQRSTUV = wasm_u16x8_load8x8(i1 + 24); - i1 += 32; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - v128_t vacc89ABCDEF = wasm_i16x8_add(vxi0x89ABCDEF, vxi1x89ABCDEF); - const v128_t 
vxi2x89ABCDEF = wasm_u16x8_load8x8(i2 + 8); - v128_t vaccGHIJKLMN = wasm_i16x8_add(vxi0xGHIJKLMN, vxi1xGHIJKLMN); - const v128_t vxi2xGHIJKLMN = wasm_u16x8_load8x8(i2 + 16); - v128_t vaccOPQRSTUV = wasm_i16x8_add(vxi0xOPQRSTUV, vxi1xOPQRSTUV); - const v128_t vxi2xOPQRSTUV = wasm_u16x8_load8x8(i2 + 24); - i2 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi2x89ABCDEF); - const v128_t vxi3x89ABCDEF = wasm_u16x8_load8x8(i3 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi2xGHIJKLMN); - const v128_t vxi3xGHIJKLMN = wasm_u16x8_load8x8(i3 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi2xOPQRSTUV); - const v128_t vxi3xOPQRSTUV = wasm_u16x8_load8x8(i3 + 24); - i3 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi3x89ABCDEF); - const v128_t vxi4x89ABCDEF = wasm_u16x8_load8x8(i4 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi3xGHIJKLMN); - const v128_t vxi4xGHIJKLMN = wasm_u16x8_load8x8(i4 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi3xOPQRSTUV); - const v128_t vxi4xOPQRSTUV = wasm_u16x8_load8x8(i4 + 24); - i4 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi4x89ABCDEF); - const v128_t vxi5x89ABCDEF = wasm_u16x8_load8x8(i5 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi4xGHIJKLMN); - const v128_t vxi5xGHIJKLMN = wasm_u16x8_load8x8(i5 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi4xOPQRSTUV); - const v128_t vxi5xOPQRSTUV = wasm_u16x8_load8x8(i5 + 24); - i5 += 32; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi5x89ABCDEF); - const v128_t vxi6x89ABCDEF = wasm_u16x8_load8x8(i6 + 8); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi5xGHIJKLMN); - const v128_t vxi6xGHIJKLMN = wasm_u16x8_load8x8(i6 + 16); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi5xOPQRSTUV); - const v128_t vxi6xOPQRSTUV = wasm_u16x8_load8x8(i6 + 24); - i6 += 32; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567); - vacc89ABCDEF = wasm_i16x8_add(vacc89ABCDEF, vxi6x89ABCDEF); - vaccGHIJKLMN = wasm_i16x8_add(vaccGHIJKLMN, vxi6xGHIJKLMN); - vaccOPQRSTUV = wasm_i16x8_add(vaccOPQRSTUV, vxi6xOPQRSTUV); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - v128_t vacc89AB = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc89ABCDEF)); - v128_t vaccCDEF = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc89ABCDEF)); - v128_t vaccGHIJ = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccGHIJKLMN)); - v128_t vaccKLMN = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccGHIJKLMN)); - v128_t vaccOPQR = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vaccOPQRSTUV)); - v128_t vaccSTUV = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vaccOPQRSTUV)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - vacc89AB = wasm_f32x4_convert_i32x4(vacc89AB); - vaccCDEF = wasm_f32x4_convert_i32x4(vaccCDEF); - vaccGHIJ = wasm_f32x4_convert_i32x4(vaccGHIJ); - vaccKLMN = 
wasm_f32x4_convert_i32x4(vaccKLMN); - vaccOPQR = wasm_f32x4_convert_i32x4(vaccOPQR); - vaccSTUV = wasm_f32x4_convert_i32x4(vaccSTUV); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - vacc89AB = wasm_f32x4_mul(vacc89AB, vscale); - vaccCDEF = wasm_f32x4_mul(vaccCDEF, vscale); - vaccGHIJ = wasm_f32x4_mul(vaccGHIJ, vscale); - vaccKLMN = wasm_f32x4_mul(vaccKLMN, vscale); - vaccOPQR = wasm_f32x4_mul(vaccOPQR, vscale); - vaccSTUV = wasm_f32x4_mul(vaccSTUV, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - vacc89AB = wasm_f32x4_add(vacc89AB, vmagic_bias); - vaccCDEF = wasm_f32x4_add(vaccCDEF, vmagic_bias); - vaccGHIJ = wasm_f32x4_add(vaccGHIJ, vmagic_bias); - vaccKLMN = wasm_f32x4_add(vaccKLMN, vmagic_bias); - vaccOPQR = wasm_f32x4_add(vaccOPQR, vmagic_bias); - vaccSTUV = wasm_f32x4_add(vaccSTUV, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - vacc89AB = wasm_i32x4_max(vacc89AB, vmagic_min); - vaccCDEF = wasm_i32x4_max(vaccCDEF, vmagic_min); - vaccGHIJ = wasm_i32x4_max(vaccGHIJ, vmagic_min); - vaccKLMN = wasm_i32x4_max(vaccKLMN, vmagic_min); - vaccOPQR = wasm_i32x4_max(vaccOPQR, vmagic_min); - vaccSTUV = wasm_i32x4_max(vaccSTUV, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - vacc89AB = wasm_i32x4_sub(vacc89AB, vmagic_bias_less_output_zero_point); - vaccCDEF = wasm_i32x4_sub(vaccCDEF, vmagic_bias_less_output_zero_point); - vaccGHIJ = wasm_i32x4_sub(vaccGHIJ, vmagic_bias_less_output_zero_point); - vaccKLMN = wasm_i32x4_sub(vaccKLMN, vmagic_bias_less_output_zero_point); - vaccOPQR = wasm_i32x4_sub(vaccOPQR, vmagic_bias_less_output_zero_point); - vaccSTUV = wasm_i32x4_sub(vaccSTUV, vmagic_bias_less_output_zero_point); - - v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout89ABCDEF = wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF); - v128_t voutGHIJKLMN = wasm_i16x8_narrow_i32x4(vaccGHIJ, vaccKLMN); - v128_t voutOPQRSTUV = wasm_i16x8_narrow_i32x4(vaccOPQR, vaccSTUV); - - v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF); - v128_t voutGHIJKLMNOPQRSTUV = wasm_u8x16_narrow_i16x8(voutGHIJKLMN, voutOPQRSTUV); - - vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = wasm_u8x16_min(voutGHIJKLMNOPQRSTUV, voutput_max); - - wasm_v128_store(output, vout0123456789ABCDEF); - wasm_v128_store(output + 16, voutGHIJKLMNOPQRSTUV); - output += 32; - } - if XNN_UNLIKELY(channels != 0) { - do { - const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0); - i0 += 8; - const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1); - i1 += 8; - - v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567); - const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2); - i2 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567); - const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3); - i3 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567); - const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4); - i4 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567); - const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5); - i5 += 8; - vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567); - const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6); - i6 += 8; - - vacc01234567 = wasm_i16x8_add(vacc01234567, 
vxi6x01234567); - - v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567)); - v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567)); - - vacc0123 = wasm_f32x4_convert_i32x4(vacc0123); - vacc4567 = wasm_f32x4_convert_i32x4(vacc4567); - - vacc0123 = wasm_f32x4_mul(vacc0123, vscale); - vacc4567 = wasm_f32x4_mul(vacc4567, vscale); - - vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias); - vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias); - - vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min); - vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min); - - vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point); - vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point); - - const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567); - v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567); - vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max); - - if XNN_LIKELY(channels >= 8) { - wasm_v128_store64_lane(output, vout0123456701234567, 0); - output += 8; - channels -= 8; - } else { - if (channels & 4) { - wasm_v128_store32_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32); - output += 4; - } - if (channels & 2) { - wasm_v128_store16_lane(output, vout0123456701234567, 0); - vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16); - output += 2; - } - if (channels & 1) { - wasm_v128_store8_lane(output, vout0123456701234567, 0); - output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c deleted file mode 100644 index da48872efc2..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-fp32-wasmsimd-c8.c +++ /dev/null @@ -1,177 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-wasmsimd.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
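Every fp32 wasmsimd kernel this patch removes requantizes with the same "magic bias" trick: the integer accumulator is converted to float, scaled, and offset by 0x1.8p+23f so that round-to-nearest happens in the low mantissa bits; the lower output clamp is then applied directly on the raw bit pattern, and a single integer subtraction strips the bias bits and folds in the output zero point. A one-lane scalar sketch follows; the parameter derivation here is a simplified assumption, since the real kernels precompute vmagic_min and vmagic_bias_less_output_zero_point when the params struct is initialized.

#include <stdint.h>
#include <string.h>

/* One-lane sketch (assumptions noted above) of the fp32 "magic bias"
 * requantization the deleted wasmsimd kernels apply four lanes at a time.
 * Valid while |acc * scale| < 2^22, so the rounded integer fits entirely
 * in the low mantissa bits of the biased float. */
static uint8_t requantize_fp32_magic(int32_t acc, float scale,
                                     int32_t output_zero_point,
                                     uint8_t output_min, uint8_t output_max) {
  const float magic_bias = 12582912.0f;  /* 0x1.8p+23f, bit pattern 0x4B400000 */
  const float biased = (float) acc * scale + magic_bias;
  int32_t bits;
  memcpy(&bits, &biased, sizeof(bits));  /* bits == 0x4B400000 + round(acc * scale) */
  /* Lower clamp on the raw bit pattern; stands in for vmagic_min. */
  const float magic_min_f = magic_bias + (float) ((int32_t) output_min - output_zero_point);
  int32_t magic_min;
  memcpy(&magic_min, &magic_min_f, sizeof(magic_min));
  if (bits < magic_min) bits = magic_min;
  /* vmagic_bias_less_output_zero_point: strip the bias bits and add the
   * zero point in a single subtraction. */
  bits -= INT32_C(0x4B400000) - output_zero_point;
  /* Upper clamp; the vector code gets this from its saturating narrows
   * plus wasm_u8x16_min, so it is only made explicit here. */
  if (bits > (int32_t) output_max) bits = (int32_t) output_max;
  return (uint8_t) bits;
}

The upper clamp needs no dedicated instruction in the vector scheme: it falls out of the saturating i32-to-i16 and i16-to-u8 narrowing followed by the final wasm_u8x16_min against voutput_max.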
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/gavgpool.h"
-
-
-void xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8(
-    size_t rows,
-    size_t channels,
-    const uint8_t* input,
-    size_t input_stride,
-    const uint8_t* zero,
-    uint8_t* output,
-    const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-  const uint8_t* i0 = input;
-  const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 2) {
-    i1 = zero;
-  }
-  const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 2) {
-    i2 = zero;
-  }
-  const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 4) {
-    i3 = zero;
-  }
-  const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 4) {
-    i4 = zero;
-  }
-  const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
-  if XNN_UNPREDICTABLE(rows < 6) {
-    i5 = zero;
-  }
-  const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  if XNN_UNPREDICTABLE(rows <= 6) {
-    i6 = zero;
-  }
-
-  const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
-  const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale);
-  const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias);
-  const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min);
-  const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point);
-  const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max);
-  for (; channels >= 8; channels -= 8) {
-    const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
-    i0 += 8;
-    const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
-    i1 += 8;
-
-    v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
-    const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
-    i2 += 8;
-
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
-    const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
-    i3 += 8;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
-    const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
-    i4 += 8;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
-    const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
-    i5 += 8;
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
-    const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
-    i6 += 8;
-
-    vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
-
-    v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
-    v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
-
-    vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
-    vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
-
-    vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
-    vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
-
-    vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
-    vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
-
-    vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
-    vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
-
-    vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
-    vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
-
-    v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
-
-    v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
-
-    vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
-
-    wasm_v128_store64_lane(output, vout0123456701234567, 0);
-    output += 8;
-  }
-  if XNN_UNLIKELY(channels != 0) {
-    {
-      const v128_t vxi0x01234567 = wasm_u16x8_load8x8(i0);
-      i0 += 8;
-      const v128_t vxi1x01234567 = wasm_u16x8_load8x8(i1);
-      i1 += 8;
-
-      v128_t vacc01234567 = wasm_i16x8_add(vxi0x01234567, vxi1x01234567);
-      const v128_t vxi2x01234567 = wasm_u16x8_load8x8(i2);
-      i2 += 8;
-
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi2x01234567);
-      const v128_t vxi3x01234567 = wasm_u16x8_load8x8(i3);
-      i3 += 8;
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi3x01234567);
-      const v128_t vxi4x01234567 = wasm_u16x8_load8x8(i4);
-      i4 += 8;
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi4x01234567);
-      const v128_t vxi5x01234567 = wasm_u16x8_load8x8(i5);
-      i5 += 8;
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi5x01234567);
-      const v128_t vxi6x01234567 = wasm_u16x8_load8x8(i6);
-      i6 += 8;
-
-      vacc01234567 = wasm_i16x8_add(vacc01234567, vxi6x01234567);
-
-      v128_t vacc0123 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_low_u16x8(vacc01234567));
-      v128_t vacc4567 = wasm_i32x4_add(vinit_bias, wasm_u32x4_extend_high_u16x8(vacc01234567));
-
-      vacc0123 = wasm_f32x4_convert_i32x4(vacc0123);
-      vacc4567 = wasm_f32x4_convert_i32x4(vacc4567);
-
-      vacc0123 = wasm_f32x4_mul(vacc0123, vscale);
-      vacc4567 = wasm_f32x4_mul(vacc4567, vscale);
-
-      vacc0123 = wasm_f32x4_add(vacc0123, vmagic_bias);
-      vacc4567 = wasm_f32x4_add(vacc4567, vmagic_bias);
-
-      vacc0123 = wasm_i32x4_max(vacc0123, vmagic_min);
-      vacc4567 = wasm_i32x4_max(vacc4567, vmagic_min);
-
-      vacc0123 = wasm_i32x4_sub(vacc0123, vmagic_bias_less_output_zero_point);
-      vacc4567 = wasm_i32x4_sub(vacc4567, vmagic_bias_less_output_zero_point);
-
-      const v128_t vout01234567 = wasm_i16x8_narrow_i32x4(vacc0123, vacc4567);
-      v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
-      vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
-
-      if (channels & 4) {
-        wasm_v128_store32_lane(output, vout0123456701234567, 0);
-        vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
-        output += 4;
-      }
-      if (channels & 2) {
-        wasm_v128_store16_lane(output, vout0123456701234567, 0);
-        vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
-        output += 2;
-      }
-      if (channels & 1) {
-        wasm_v128_store8_lane(output, vout0123456701234567, 0);
-      }
-    }
-  }
-}
diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c
deleted file mode 100644
index 5c0e0558f6e..00000000000
--- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c16.c
+++ /dev/null
@@ -1,195 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gavgpool/unipass-neon.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
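The rndnu NEON kernels removed below all funnel their 32-bit row sums through one fixed-point pipeline: a saturating left pre-shift (vqshlq_s32), a Q31 doubling-high multiply (vqdmulhq_s32), a rounding shift back down (vrshlq_s32 with a negative count), then zero-point addition and saturating narrows with a final clamp. A one-lane scalar sketch under stated simplifications; no parameter values from this patch are assumed.

#include <stdint.h>

/* One-lane sketch of the rndnu requantization in the deleted NEON
 * kernels. Simplifications (assumptions, not the full semantics):
 * vqshlq_s32 saturation is omitted, pre_shift is assumed >= 0, and
 * post_shift is assumed <= -1, so the last shift is a rounding right
 * shift, which is how vrshlq_s32 behaves for negative counts. */
static uint8_t requantize_rndnu(int32_t acc, int32_t multiplier,
                                int32_t pre_shift, int32_t post_shift,
                                int16_t output_zero_point,
                                uint8_t output_min, uint8_t output_max) {
  /* vqshlq_s32: left shift the sum toward the high bits. */
  const int32_t preshifted = (int32_t) ((uint32_t) acc << pre_shift);
  /* vqdmulhq_s32: high half of the doubled product, i.e. (a * b) >> 31. */
  const int32_t mulhi = (int32_t) (((int64_t) preshifted * (int64_t) multiplier) >> 31);
  /* vrshlq_s32 with a negative shift: rounding right shift by -post_shift. */
  const int32_t rshift = -post_shift;
  const int32_t rounded = (int32_t) (((int64_t) mulhi + ((int64_t) 1 << (rshift - 1))) >> rshift);
  /* vqmovn_s32 then vqaddq_s16: saturate to int16, add the zero point. */
  int32_t q = rounded;
  if (q > INT16_MAX) q = INT16_MAX;
  if (q < INT16_MIN) q = INT16_MIN;
  q += output_zero_point;
  /* vqmovun_s16 plus vmax_u8/vmin_u8: clamp into [output_min, output_max]. */
  if (q < (int32_t) output_min) q = (int32_t) output_min;
  if (q > (int32_t) output_max) q = (int32_t) output_max;
  return (uint8_t) q;
}

Splitting the scaling around the multiply is the point of the pre/post shift pair: the pre-shift moves the sum into the high bits so the high-half product keeps precision, and the post-shift applies round-to-nearest (ties up), which is where the "rndnu" name comes from.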
- -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->rndnu_neon.init_bias); - const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); - for (; channels >= 16; channels -= 16) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = 
vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, 
vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c deleted file mode 100644 index 272fd1370a3..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c24.c +++ /dev/null @@ -1,224 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->rndnu_neon.init_bias); - const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); - for (; channels >= 24; channels -= 24) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t 
vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift); - vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier); - vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift); - vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = 
vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x8_t voutGHIJKLMN = vqmovun_s16(vaccGHIJKLMN); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMN = vmax_u8(voutGHIJKLMN, vget_low_u8(voutput_min)); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMN = vmin_u8(voutGHIJKLMN, vget_low_u8(voutput_max)); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1_u8(output, voutGHIJKLMN); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if 
(channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c deleted file mode 100644 index 6a082250f4c..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c32.c +++ /dev/null @@ -1,248 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->rndnu_neon.init_bias); - const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); - const uint8x16_t voutput_min = vld1q_dup_u8(¶ms->rndnu_neon.output_min); - const uint8x16_t voutput_max = vld1q_dup_u8(¶ms->rndnu_neon.output_max); - for (; channels >= 32; channels -= 32) { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0x89ABCDEF = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xGHIJKLMN = vld1_u8(i0); i0 += 8; - const uint8x8_t vi0xOPQRSTUV = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1x89ABCDEF = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xGHIJKLMN = vld1_u8(i1); i1 += 8; - const uint8x8_t vi1xOPQRSTUV = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - const uint8x8_t vi2x89ABCDEF = vld1_u8(i2); i2 += 8; - uint16x8_t vsum89ABCDEF = vaddl_u8(vi0x89ABCDEF, vi1x89ABCDEF); - const uint8x8_t vi2xGHIJKLMN = vld1_u8(i2); i2 += 8; - uint16x8_t vsumGHIJKLMN = vaddl_u8(vi0xGHIJKLMN, vi1xGHIJKLMN); - const uint8x8_t vi2xOPQRSTUV = vld1_u8(i2); i2 += 8; - uint16x8_t vsumOPQRSTUV = vaddl_u8(vi0xOPQRSTUV, vi1xOPQRSTUV); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t 
vi3x89ABCDEF = vld1_u8(i3); i3 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi2x89ABCDEF); - const uint8x8_t vi3xGHIJKLMN = vld1_u8(i3); i3 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi2xGHIJKLMN); - const uint8x8_t vi3xOPQRSTUV = vld1_u8(i3); i3 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi2xOPQRSTUV); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi4x89ABCDEF = vld1_u8(i4); i4 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi3x89ABCDEF); - const uint8x8_t vi4xGHIJKLMN = vld1_u8(i4); i4 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi3xGHIJKLMN); - const uint8x8_t vi4xOPQRSTUV = vld1_u8(i4); i4 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi3xOPQRSTUV); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi5x89ABCDEF = vld1_u8(i5); i5 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi4x89ABCDEF); - const uint8x8_t vi5xGHIJKLMN = vld1_u8(i5); i5 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi4xGHIJKLMN); - const uint8x8_t vi5xOPQRSTUV = vld1_u8(i5); i5 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi4xOPQRSTUV); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - const uint8x8_t vi6x89ABCDEF = vld1_u8(i6); i6 += 8; - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi5x89ABCDEF); - const uint8x8_t vi6xGHIJKLMN = vld1_u8(i6); i6 += 8; - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi5xGHIJKLMN); - const uint8x8_t vi6xOPQRSTUV = vld1_u8(i6); i6 += 8; - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi5xOPQRSTUV); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - vsum89ABCDEF = vaddw_u8(vsum89ABCDEF, vi6x89ABCDEF); - vsumGHIJKLMN = vaddw_u8(vsumGHIJKLMN, vi6xGHIJKLMN); - vsumOPQRSTUV = vaddw_u8(vsumOPQRSTUV, vi6xOPQRSTUV); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - int32x4_t vacc89AB = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum89ABCDEF))); - int32x4_t vaccCDEF = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum89ABCDEF))); - int32x4_t vaccGHIJ = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumGHIJKLMN))); - int32x4_t vaccKLMN = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumGHIJKLMN))); - int32x4_t vaccOPQR = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsumOPQRSTUV))); - int32x4_t vaccSTUV = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsumOPQRSTUV))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - vacc89AB = vqshlq_s32(vacc89AB, vleft_pre_shift); - vaccCDEF = vqshlq_s32(vaccCDEF, vleft_pre_shift); - vaccGHIJ = vqshlq_s32(vaccGHIJ, vleft_pre_shift); - vaccKLMN = vqshlq_s32(vaccKLMN, vleft_pre_shift); - vaccOPQR = vqshlq_s32(vaccOPQR, vleft_pre_shift); - vaccSTUV = vqshlq_s32(vaccSTUV, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - vacc89AB = vqdmulhq_s32(vacc89AB, vmultiplier); - vaccCDEF = vqdmulhq_s32(vaccCDEF, vmultiplier); - vaccGHIJ = vqdmulhq_s32(vaccGHIJ, vmultiplier); - vaccKLMN = vqdmulhq_s32(vaccKLMN, vmultiplier); - vaccOPQR = 
vqdmulhq_s32(vaccOPQR, vmultiplier); - vaccSTUV = vqdmulhq_s32(vaccSTUV, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - vacc89AB = vrshlq_s32(vacc89AB, vleft_post_shift); - vaccCDEF = vrshlq_s32(vaccCDEF, vleft_post_shift); - vaccGHIJ = vrshlq_s32(vaccGHIJ, vleft_post_shift); - vaccKLMN = vrshlq_s32(vaccKLMN, vleft_post_shift); - vaccOPQR = vrshlq_s32(vaccOPQR, vleft_post_shift); - vaccSTUV = vrshlq_s32(vaccSTUV, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - int16x8_t vacc89ABCDEF = vqmovn_high_s32(vqmovn_s32(vacc89AB), vaccCDEF); - int16x8_t vaccGHIJKLMN = vqmovn_high_s32(vqmovn_s32(vaccGHIJ), vaccKLMN); - int16x8_t vaccOPQRSTUV = vqmovn_high_s32(vqmovn_s32(vaccOPQR), vaccSTUV); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - int16x8_t vacc89ABCDEF = vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)); - int16x8_t vaccGHIJKLMN = vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)); - int16x8_t vaccOPQRSTUV = vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - vacc89ABCDEF = vqaddq_s16(vacc89ABCDEF, voutput_zero_point); - vaccGHIJKLMN = vqaddq_s16(vaccGHIJKLMN, voutput_zero_point); - vaccOPQRSTUV = vqaddq_s16(vaccOPQRSTUV, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc01234567), vacc89ABCDEF); - uint8x16_t voutGHIJKLMNOPQRSTUV = vqmovun_high_s16(vqmovun_s16(vaccGHIJKLMN), vaccOPQRSTUV); - #else // !XNN_ARCH_ARM64 - uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF)); - uint8x16_t voutGHIJKLMNOPQRSTUV = vcombine_u8(vqmovun_s16(vaccGHIJKLMN), vqmovun_s16(vaccOPQRSTUV)); - #endif // !XNN_ARCH_ARM64 - - vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min); - voutGHIJKLMNOPQRSTUV = vmaxq_u8(voutGHIJKLMNOPQRSTUV, voutput_min); - - vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max); - voutGHIJKLMNOPQRSTUV = vminq_u8(voutGHIJKLMNOPQRSTUV, voutput_max); - - vst1q_u8(output, vout0123456789ABCDEF); output += 16; - vst1q_u8(output, voutGHIJKLMNOPQRSTUV); output += 16; - } - if XNN_UNLIKELY(channels != 0) { - do { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = 
vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min)); - vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max)); - - if XNN_LIKELY(channels >= 8) { - vst1_u8(output, vout01234567); output += 8; - channels -= 8; - } else { - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); output += 1; - } - channels = 0; - } - } while (channels != 0); - } -} diff --git a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c b/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c deleted file mode 100644 index 75b0d60d8e7..00000000000 --- a/src/qu8-gavgpool/gen/qu8-gavgpool-7x-minmax-rndnu-neon-c8.c +++ /dev/null @@ -1,165 +0,0 @@ -// Auto-generated file. Do not edit! -// Template: src/qs8-gavgpool/unipass-neon.c.in -// Generator: tools/xngen -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/gavgpool.h" - - -void xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS -{ - assert(rows != 0); - assert(rows <= 7); - assert(channels != 0); - - const uint8_t* i0 = input; - const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride); - if XNN_UNPREDICTABLE(rows < 2) { - i1 = zero; - } - const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride); - if XNN_UNPREDICTABLE(rows <= 2) { - i2 = zero; - } - const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride); - if XNN_UNPREDICTABLE(rows < 4) { - i3 = zero; - } - const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride); - if XNN_UNPREDICTABLE(rows <= 4) { - i4 = zero; - } - const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride); - if XNN_UNPREDICTABLE(rows < 6) { - i5 = zero; - } - const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride); - if XNN_UNPREDICTABLE(rows <= 6) { - i6 = zero; - } - - const int32x4_t vinit_bias = vld1q_dup_s32(¶ms->rndnu_neon.init_bias); - const int32x4_t vleft_pre_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_pre_shift); - const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->rndnu_neon.multiplier); - const int32x4_t vleft_post_shift = vld1q_dup_s32(¶ms->rndnu_neon.left_post_shift); - const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->rndnu_neon.output_zero_point); - const uint8x8_t voutput_min = vld1_dup_u8(¶ms->rndnu_neon.output_min); - const uint8x8_t voutput_max = vld1_dup_u8(¶ms->rndnu_neon.output_max); - for (; channels >= 8; channels -= 8) { - const uint8x8_t 
vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else // !XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif // !XNN_ARCH_ARM64 - - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - #if XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #else // !XNN_ARCH_ARM64 - uint8x8_t vout01234567 = vqmovun_s16(vacc01234567); - #endif // !XNN_ARCH_ARM64 - - vout01234567 = vmax_u8(vout01234567, voutput_min); - - vout01234567 = vmin_u8(vout01234567, voutput_max); - - vst1_u8(output, vout01234567); output += 8; - } - if XNN_UNLIKELY(channels != 0) { - { - const uint8x8_t vi0x01234567 = vld1_u8(i0); i0 += 8; - const uint8x8_t vi1x01234567 = vld1_u8(i1); i1 += 8; - const uint8x8_t vi2x01234567 = vld1_u8(i2); i2 += 8; - uint16x8_t vsum01234567 = vaddl_u8(vi0x01234567, vi1x01234567); - - const uint8x8_t vi3x01234567 = vld1_u8(i3); i3 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi2x01234567); - const uint8x8_t vi4x01234567 = vld1_u8(i4); i4 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi3x01234567); - const uint8x8_t vi5x01234567 = vld1_u8(i5); i5 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi4x01234567); - const uint8x8_t vi6x01234567 = vld1_u8(i6); i6 += 8; - vsum01234567 = vaddw_u8(vsum01234567, vi5x01234567); - vsum01234567 = vaddw_u8(vsum01234567, vi6x01234567); - - int32x4_t vacc0123 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_low_u16(vsum01234567))); - int32x4_t vacc4567 = vreinterpretq_s32_u32(vaddw_u16(vreinterpretq_u32_s32(vinit_bias), vget_high_u16(vsum01234567))); - - vacc0123 = vqshlq_s32(vacc0123, vleft_pre_shift); - vacc4567 = vqshlq_s32(vacc4567, vleft_pre_shift); - - vacc0123 = vqdmulhq_s32(vacc0123, vmultiplier); - vacc4567 = vqdmulhq_s32(vacc4567, vmultiplier); - - vacc0123 = vrshlq_s32(vacc0123, vleft_post_shift); - vacc4567 = vrshlq_s32(vacc4567, vleft_post_shift); - - #if XNN_ARCH_ARM64 - int16x8_t vacc01234567 = vqmovn_high_s32(vqmovn_s32(vacc0123), vacc4567); - #else - int16x8_t vacc01234567 = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)); - #endif - vacc01234567 = vqaddq_s16(vacc01234567, voutput_zero_point); - - uint8x8_t vout01234567 = 
vqmovun_s16(vacc01234567); - vout01234567 = vmax_u8(vout01234567, voutput_min); - vout01234567 = vmin_u8(vout01234567, voutput_max); - - if (channels & 4) { - vst1_lane_u32((void*) output, vreinterpret_u32_u8(vout01234567), 0); output += 4; - vout01234567 = vext_u8(vout01234567, vout01234567, 4); - } - if (channels & 2) { - vst1_lane_u16((void*) output, vreinterpret_u16_u8(vout01234567), 0); output += 2; - vout01234567 = vext_u8(vout01234567, vout01234567, 2); - } - if (channels & 1) { - vst1_lane_u8(output, vout01234567, 0); - } - } - } -} diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index aaa73bebcce..1d946242f37 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1084,67 +1084,6 @@ struct pixelwise_average_pooling_context { size_t output_y); #endif -struct global_average_pooling_nwc_context { - const void* input; - const void* zero; - size_t input_pixel_stride; - size_t input_batch_stride; - size_t input_elements; - size_t channels; - void* output; - size_t output_batch_stride; - union { - union xnn_qs8_avgpool_minmax_params qs8; - union xnn_qu8_avgpool_minmax_params qu8; - struct xnn_f16_scaleminmax_params f16; - struct xnn_f32_scaleminmax_params f32; - } params; - union { - xnn_gavgpool_unipass_ukernel_fn unipass_ukernel; - xnn_gavgpool_multipass_ukernel_fn multipass_ukernel; - }; - size_t multipass_batch_stride; - void* multipass_buffer; -}; - -#ifndef __cplusplus - XNN_PRIVATE void xnn_compute_global_average_pooling_nwc_unipass( - const struct global_average_pooling_nwc_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_index); - - XNN_PRIVATE void xnn_compute_global_average_pooling_nwc_multipass( - const struct global_average_pooling_nwc_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_index); - - XNN_PRIVATE void xnn_compute_global_average_pooling_nwc_multipass_with_thread( - const struct global_average_pooling_nwc_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t thread_index, - size_t batch_index); -#endif - -struct global_average_pooling_ncw_context { - size_t input_elements; - const void* input; - size_t input_channel_stride; - size_t input_batch_stride; - void* output; - size_t output_channel_stride; - size_t output_batch_stride; - xnn_gavgpool_cw_ukernel_fn ukernel; - union { - union xnn_f16_gavgpool_params f16; - union xnn_f32_gavgpool_params f32; - } params; -}; - -#ifndef __cplusplus - XNN_PRIVATE void xnn_compute_global_average_pooling_ncw( - const struct global_average_pooling_ncw_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t batch_index, - size_t channels_start, - size_t channels_slice); -#endif - struct resize_bilinear_nhwc_indirection_init_context { const void** buffer; const void* input; diff --git a/src/xnnpack/config-types.h b/src/xnnpack/config-types.h index 61b328104d6..c60c268c660 100644 --- a/src/xnnpack/config-types.h +++ b/src/xnnpack/config-types.h @@ -9,9 +9,7 @@ #include #include -#include "xnnpack/common.h" #include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" #ifdef __cplusplus extern "C" { @@ -170,46 +168,6 @@ struct xnn_pavgpool_config { uint16_t channel_tile; }; -struct xnn_gavgpool_config { - xnn_gavgpool_unipass_ukernel_fn unipass; - xnn_gavgpool_multipass_ukernel_fn multipass; - union { - xnn_init_f16_scaleminmax_params_fn f16; - xnn_init_f32_scaleminmax_params_fn f32; - xnn_init_qs8_avgpool_minmax_params_fn qs8; - xnn_init_qu8_avgpool_minmax_params_fn qu8; - } init; - union { - xnn_update_f16_scaleminmax_params_fn f16; - xnn_update_f32_scaleminmax_params_fn f32; - 
xnn_update_qs8_avgpool_minmax_params_fn qs8; - xnn_update_qu8_avgpool_minmax_params_fn qu8; - } update; - // Number of rows in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of rows in each call. - uint16_t row_tile; - // Number of channels in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of channels in each call. - uint16_t channel_tile; -}; - -struct xnn_gavgpool_cw_config { - xnn_gavgpool_cw_ukernel_fn ukernel; - union { - xnn_init_f16_gavgpool_neon_params_fn f16; - xnn_init_f32_gavgpool_params_fn f32; - } init; - union { - xnn_update_f16_gavgpool_scalar_params_fn f16; - xnn_update_f32_gavgpool_params_fn f32; - } update; - - // Number of input pixels in a tile. - // For best efficiency, micro-kernel must process a multiple of this number of pixels in each call. - uint8_t pixel_tile; - // Channel tile is always 1. -}; - union xnn_dwconv_ukernel { xnn_dwconv_unipass_ukernel_fn unipass; xnn_dwconv_multipass_ukernel_fn multipass; diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index d23151572a8..cbf52c4fd9b 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -129,11 +129,6 @@ XNN_INTERNAL const struct xnn_avgpool_config* xnn_init_qu8_avgpool_config(); XNN_INTERNAL const struct xnn_pavgpool_config* xnn_init_f16_pavgpool_config(); XNN_INTERNAL const struct xnn_pavgpool_config* xnn_init_f32_pavgpool_config(); -XNN_INTERNAL const struct xnn_gavgpool_config* xnn_init_f16_gavgpool_config(); -XNN_INTERNAL const struct xnn_gavgpool_config* xnn_init_f32_gavgpool_config(); -XNN_INTERNAL const struct xnn_gavgpool_config* xnn_init_qs8_gavgpool_config(); -XNN_INTERNAL const struct xnn_gavgpool_config* xnn_init_qu8_gavgpool_config(); - #define XNN_MAX_F16_DWCONV_UKERNELS 4 #define XNN_MAX_F32_DWCONV_UKERNELS 4 #define XNN_MAX_QC8_DWCONV_UKERNELS 3 diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h deleted file mode 100644 index 489f2a382ab..00000000000 --- a/src/xnnpack/gavgpool.h +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
- -#pragma once - -#include <stddef.h> -#include <stdint.h> - -#include "xnnpack/common.h" -#include "xnnpack/microparams.h" - -#ifdef __cplusplus -extern "C" { -#endif - - -#define DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const float* input, \ - size_t input_stride, \ - const float* zero, \ - float* buffer, \ - float* output, \ - const struct xnn_f32_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4) -DECLARE_F32_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4) - -#define DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const float* input, \ - size_t input_stride, \ - const float* zero, \ - float* output, \ - const struct xnn_f32_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4) -DECLARE_F32_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4) - - -#define DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const xnn_float16* input, \ - size_t input_stride, \ - const xnn_float16* zero, \ - xnn_float16* buffer, \ - xnn_float16* output, \ - const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8) -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16) -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24) -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32) -
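The `7p7x` multipass declarations above differ from the `7x` unipass declarations only by the extra `buffer` argument: a unipass kernel averages at most seven input rows straight into the output, while a multipass kernel sums the first seven rows into `buffer`, adds up to seven more rows per subsequent pass, and applies the scale only at the end. A minimal scalar sketch of that control flow (the helper name is invented for illustration; it assumes `rows >= 7`, takes `input_stride` in elements rather than bytes, and omits the min/max clamping the real kernels perform):

    #include <stddef.h>

    // Multipass global average pooling, 7 rows per pass: the first pass
    // seeds the accumulator buffer, later passes add to it, and the final
    // loop applies the precomputed 1/rows scale.
    static void gavgpool_multipass_sketch(size_t rows, size_t channels,
                                          const float* input, size_t input_stride,
                                          float* buffer, float* output, float scale) {
      for (size_t c = 0; c < channels; c++) {  // first pass: sum 7 rows
        float acc = 0.0f;
        for (size_t r = 0; r < 7; r++) {
          acc += input[r * input_stride + c];
        }
        buffer[c] = acc;
      }
      rows -= 7;
      input += 7 * input_stride;
      while (rows != 0) {  // middle/last passes: up to 7 more rows each
        const size_t pass_rows = rows < 7 ? rows : 7;
        for (size_t c = 0; c < channels; c++) {
          float acc = buffer[c];
          for (size_t r = 0; r < pass_rows; r++) {
            acc += input[r * input_stride + c];
          }
          buffer[c] = acc;
        }
        rows -= pass_rows;
        input += pass_rows * input_stride;
      }
      for (size_t c = 0; c < channels; c++) {  // scale and store
        output[c] = buffer[c] * scale;
      }
    }

The unipass variants need no buffer because the whole reduction fits in one pass; the operator code selected between the two shapes at setup time based on the pooled height, which is why both existed side by side.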
-DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8) -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16) -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24) -DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32) - - -#define DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const xnn_float16* input, \ - size_t input_stride, \ - const xnn_float16* zero, \ - xnn_float16* output, \ - const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8) -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16) -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24) -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32) - -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8) -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16) -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24) -DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32) - - -#define DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const int8_t* input, \ - size_t input_stride, \ - const int8_t* zero, \ - int32_t* buffer, \ - int8_t* output, \ - const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16) 
-DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4) - -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2) -DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4) - - -#define DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const int8_t* input, \ - size_t input_stride, \ - const int8_t* zero, \ - int8_t* output, \ - const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32) 
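The `fp32` and `rndnu` infixes in these QS8 declarations (and in the QU8 ones further down) name the requantization scheme that maps the 32-bit row sums back to 8 bits: `fp32` variants convert to float, scale, and round, while `rndnu` variants stay in fixed point via a saturating pre-shift, a doubling high multiply, and a rounding post-shift, the `vqshlq_s32`/`vqdmulhq_s32`/`vrshlq_s32` sequence visible in the deleted NEON kernel earlier in this patch. A scalar sketch of the rndnu path (function name invented for illustration; it assumes a positive multiplier and `post_shift >= 1`, and glosses over the NEON instructions' saturating edge cases):

    #include <stdint.h>

    // rndnu requantization of one int32 accumulator: saturating left
    // pre-shift, doubling multiply keeping the high 32 bits, rounding
    // right post-shift, then output zero point and clamping.
    static int8_t rndnu_requantize_sketch(int32_t acc, int32_t multiplier,
                                          uint32_t pre_shift, uint32_t post_shift,
                                          int16_t output_zero_point,
                                          int8_t output_min, int8_t output_max) {
      int64_t shifted = (int64_t) acc << pre_shift;  // saturating pre-shift
      if (shifted > INT32_MAX) shifted = INT32_MAX;
      if (shifted < INT32_MIN) shifted = INT32_MIN;
      // (2 * shifted * multiplier) >> 32, i.e. the doubling high multiply
      const int32_t product = (int32_t) ((shifted * multiplier) >> 31);
      // rounding right shift, ties rounded up
      const int32_t scaled =
          (int32_t) ((product + (INT64_C(1) << (post_shift - 1))) >> post_shift);
      int32_t out = scaled + output_zero_point;  // re-center and clamp
      if (out < output_min) out = output_min;
      if (out > output_max) out = output_max;
      return (int8_t) out;
    }

The fp32 variants replace those three fixed-point steps with a float multiply and a rounding float-to-int conversion, which is why their scalar fallbacks come in the `fmagic`, `imagic`, and `lrintf` flavors declared in this header.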
- -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4) - -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2) -DECLARE_QS8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4) - - -#define DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const uint8_t* input, \ - size_t input_stride, \ - const uint8_t* zero, \ - int32_t* buffer, \ - uint8_t* output, \ - const union xnn_qu8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16) 
-DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4) - -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2) -DECLARE_QU8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4) - - -#define DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t rows, \ - size_t channels, \ - const uint8_t* input, \ - size_t input_stride, \ - const uint8_t* zero, \ - uint8_t* output, \ - const union xnn_qu8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24) 
-DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4) - -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2) -DECLARE_QU8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4) - - -#define DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t elements, \ - size_t channels, \ - const float* input, \ - float* output, \ - const union xnn_f32_gavgpool_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__neon_u4) -DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__scalar_u1) -DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__sse_u4) -DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_u4) -DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_u4) - - -#define DECLARE_F16_GAVGPOOL_CW_UKERNEL_FUNCTION(fn_name) \ - XNN_INTERNAL void fn_name( \ - size_t elements, \ - size_t channels, \ - const xnn_float16* input, \ - xnn_float16* output, \ - const union xnn_f16_gavgpool_params 
params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -DECLARE_F16_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f16_gavgpool_cw_ukernel__neonfp16arith_u8) - - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 1ac1c71cf8d..bd1a2108b70 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -837,105 +837,6 @@ typedef void (*xnn_u8_ibilinear_ukernel_fn)( uint8_t* output, size_t output_increment); -// GAVGPOOL: Global AVeraGe POOLing single-pass - -typedef void (*xnn_gavgpool_unipass_ukernel_fn)( - size_t rows, - size_t channels, - const void* input, - size_t input_stride, - const void* zero, - void* output, - const void* params); - -typedef void (*xnn_f16_gavgpool_minmax_unipass_ukernel_fn)( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_f32_gavgpool_minmax_unipass_ukernel_fn)( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* output, - const struct xnn_f32_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_qs8_gavgpool_minmax_unipass_ukernel_fn)( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_qu8_gavgpool_minmax_unipass_ukernel_fn)( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -// GAVGPOOL: Global AVeraGe POOLing multi-pass - -typedef void (*xnn_gavgpool_multipass_ukernel_fn)( - size_t rows, - size_t channels, - const void* input, - size_t input_stride, - const void* zero, - void* buffer, - void* output, - const void* params); - -typedef void (*xnn_f16_gavgpool_minmax_multipass_ukernel_fn)( - size_t rows, - size_t channels, - const xnn_float16* input, - size_t input_stride, - const xnn_float16* zero, - xnn_float16* buffer, - xnn_float16* output, - const struct xnn_f16_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_f32_gavgpool_minmax_multipass_ukernel_fn)( - size_t rows, - size_t channels, - const float* input, - size_t input_stride, - const float* zero, - float* buffer, - float* output, - const struct xnn_f32_scaleminmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_qs8_gavgpool_minmax_multipass_ukernel_fn)( - size_t rows, - size_t channels, - const int8_t* input, - size_t input_stride, - const int8_t* zero, - int32_t* buffer, - int8_t* output, - const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_qu8_gavgpool_minmax_multipass_ukernel_fn)( - size_t rows, - size_t channels, - const uint8_t* input, - size_t input_stride, - const uint8_t* zero, - int32_t* buffer, - uint8_t* output, - const union xnn_qu8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - // AVGPOOL: AVeraGe POOLing single-pass typedef void (*xnn_avgpool_unipass_ukernel_fn)( @@ -2329,29 +2230,6 @@ typedef void (*xnn_f32_ibilinear_chw_ukernel_fn)( float* output, size_t input_increment); -// GAVGPOOL-CW: Global AVeraGe POOLing in CW layout. 
- -typedef void (*xnn_gavgpool_cw_ukernel_fn)( - size_t batch, - size_t channels, - const float* input, - float* output, - const void* params); - -typedef void (*xnn_f16_gavgpool_cw_ukernel_fn)( - size_t batch, - size_t channels, - const xnn_float16* input, - xnn_float16* output, - const union xnn_f16_gavgpool_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - -typedef void (*xnn_f32_gavgpool_cw_ukernel_fn)( - size_t batch, - size_t channels, - const float* input, - float* output, - const union xnn_f32_gavgpool_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); - /********************* JIT microkernel generator pointers ********************/ @@ -2625,11 +2503,6 @@ typedef void (*xnn_update_qu8_avgpool_minmax_params_fn)( int32_t bias, float scale); -typedef void (*xnn_update_f16_gavgpool_scalar_params_fn)( - union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - uint16_t multiplier, - uint32_t width); - typedef size_t (*xnn_init_qs8_add_minmax_params_fn)( struct xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)], const struct xnn_quantization_params* a_quantization, @@ -2909,25 +2782,6 @@ typedef size_t (*xnn_packed_stride_weights_and_biases_fn)( size_t k_stride, // size_t extra_bytes); -typedef size_t (*xnn_init_f16_gavgpool_neon_params_fn)( - union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - uint16_t multiplier, - uint16_t output_min, - uint16_t output_max, - uint32_t width); - -typedef size_t (*xnn_init_f32_gavgpool_params_fn)( - union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - float multiplier, - float output_min, - float output_max, - uint32_t width); - -typedef void (*xnn_update_f32_gavgpool_params_fn)( - union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - float multiplier, - uint32_t width); - typedef void (*xnn_indirection_init_resize_bilinear2d_hwc_fn)( size_t output_y_start, size_t output_y_end, diff --git a/src/xnnpack/microparams-init.h b/src/xnnpack/microparams-init.h index 7363f4064ce..9da16a7375e 100644 --- a/src/xnnpack/microparams-init.h +++ b/src/xnnpack/microparams-init.h @@ -276,42 +276,6 @@ XNN_INTERNAL void xnn_update_f32_scaleminmax_scalar_params( struct xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)], float scale); -XNN_INTERNAL size_t xnn_init_f16_gavgpool_scalar_params( - union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - uint16_t multiplier, - uint16_t output_min, - uint16_t output_max, - uint32_t width); - -#define DECLARE_INIT_F32_GAVGPOOL_PARAMS_FUNCITON(fn_name) \ - XNN_INTERNAL size_t fn_name( \ - union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], \ - float multiplier, \ - float output_min, \ - float output_max, \ - uint32_t width); - -DECLARE_INIT_F32_GAVGPOOL_PARAMS_FUNCITON(xnn_init_f32_gavgpool_scalar_params); -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - DECLARE_INIT_F32_GAVGPOOL_PARAMS_FUNCITON(xnn_init_f32_gavgpool_neon_params); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - DECLARE_INIT_F32_GAVGPOOL_PARAMS_FUNCITON(xnn_init_f32_gavgpool_sse_params); -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 -XNN_INTERNAL void xnn_update_f16_gavgpool_scalar_params( - union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - uint16_t multiplier, - uint32_t width); -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - -XNN_INTERNAL void xnn_update_f32_gavgpool_params( - union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)], - float multiplier, - uint32_t width); - - XNN_INTERNAL size_t xnn_init_s8_minmax_scalar_params( struct 
xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)], int8_t output_min, diff --git a/src/xnnpack/microparams.h b/src/xnnpack/microparams.h index 7ff41529af6..3c48b858a71 100644 --- a/src/xnnpack/microparams.h +++ b/src/xnnpack/microparams.h @@ -58,7 +58,7 @@ struct xnn_f32_scale_params { }; -// Scale+Min+Max: used by AVGPOOL/GAVGPOOL microkernels. +// Scale+Min+Max: used by AVGPOOL microkernels. struct xnn_f16_scaleminmax_params { struct { @@ -397,7 +397,7 @@ struct xnn_qu8_reduce_minmax_params { } scalar; }; -// AvgPool w. Min+Max: used by quantized GAVGPOOL microkernels with MINMAX activation. +// AvgPool w. Min+Max. union xnn_qs8_avgpool_minmax_params { struct { @@ -824,43 +824,6 @@ union xnn_f32_tanh_params { char _; // Dummy member variable to comply with the C standard }; - -// GAvgPool (Global Average Pool): used by GAVGPOOL microkernels in CHW layout with Scale+Min+Max parameters. - -union xnn_f16_gavgpool_params { - struct { - XNN_ALIGN(16) uint16_t mask[8]; - uint16_t multiplier; - uint16_t output_min; - uint16_t output_max; - } scalar; -}; - -union xnn_f32_gavgpool_params { - struct { - XNN_ALIGN(16) int32_t mask[4]; - float multiplier; - float output_min; - float output_max; - } scalar; -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - struct { - XNN_ALIGN(16) float multiplier[4]; - XNN_ALIGN(16) float output_min[4]; - XNN_ALIGN(16) float output_max[4]; - XNN_ALIGN(16) uint32_t mask[4]; - } sse; -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - struct { - XNN_ALIGN(16) uint32_t mask[4]; - float multiplier; - float output_min; - float output_max; - } neon; -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 -}; - struct xnn_qs8_packw_params { int8_t input_zero_point; }; diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 73808204712..ecfcc8ca07e 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -226,9 +226,6 @@ struct xnn_operator { struct xnn_f32_sigmoid_params f32_sigmoid; struct xnn_f32_sqrt_params f32_sqrt; union xnn_f32_tanh_params f32_tanh; - // Parameters for Global Average Pooling in CHW layout - union xnn_f16_gavgpool_params f16_gavgpool; - union xnn_f32_gavgpool_params f32_gavgpool; struct xnn_f32_hswish_params f32_hswish; union xnn_f16_minmax_params f16_minmax; struct xnn_f16_scaleminmax_params f16_scaleminmax; @@ -255,20 +252,14 @@ struct xnn_operator { struct xnn_qu8_f32_cvt_params qu8_f32_cvt; union xnn_qs8_conv_minmax_params qs8_conv_minmax; union xnn_qs8_qc8w_conv_minmax_params qs8_qc8w_conv_minmax; - // Average Pooling normally use qs8_avgpool_params, but also initialize qs8_gavgpool_params in case it needs to switch - // to Global Average Pooling operation. struct { union xnn_qs8_avgpool_minmax_params qs8_avgpool; - union xnn_qs8_avgpool_minmax_params qs8_gavgpool; }; struct xnn_qs8_reduce_minmax_params qs8_reduce; struct xnn_qu8_reduce_minmax_params qu8_reduce; union xnn_qu8_conv_minmax_params qu8_conv_minmax; - // Average Pooling normally use qu8_avgpool_params, but also initialize qu8_gavgpool_params in case it needs to switch - // to Global Average Pooling operation.
struct { union xnn_qu8_avgpool_minmax_params qu8_avgpool; - union xnn_qu8_avgpool_minmax_params qu8_gavgpool; }; union xnn_qs8_hswish_params qs8_hswish; union xnn_qu8_hswish_params qu8_hswish; @@ -308,7 +299,6 @@ struct xnn_operator { const struct xnn_argmaxpool_config* argmaxpool_config; struct { const struct xnn_avgpool_config* avgpool_config; - const struct xnn_gavgpool_config* gavgpool_config; const struct xnn_pavgpool_config* pavgpool_config; const struct xnn_reduce_config* rdsum_config; const struct xnn_reduce_config* rsum_config; @@ -316,7 +306,6 @@ const struct xnn_unary_elementwise_config* s32_f32_cvt_config; const struct xnn_unary_elementwise_config* u32_f32_cvt_config; }; - const struct xnn_gavgpool_cw_config* gavgpool_cw_config; const struct xnn_ibilinear_chw_config* ibilinear_chw_config; const struct xnn_ibilinear_config* ibilinear_config; struct { @@ -380,8 +369,6 @@ struct xnn_operator { struct packw_gemm_gio_context packw_gemm_gio; bool const_weights; } gemm; - struct global_average_pooling_nwc_context global_average_pooling_nwc; - struct global_average_pooling_ncw_context global_average_pooling_ncw; struct { struct igemm_context igemm; struct conv2d_igemm_indirection_init_context conv2d_igemm_indirection_init; diff --git a/test/BUILD.bazel b/test/BUILD.bazel index 8ffa3659ce6..3e0ae92279f 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -503,15 +503,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) -xnnpack_unit_test( - name = "f16_gavgpool_minmax_test", - srcs = [ - "f16-gavgpool-minmax.cc", - "gavgpool-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "f16_f32acc_gemm_minmax_test", srcs = [ @@ -666,15 +657,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) -xnnpack_unit_test( - name = "f32_gavgpool_minmax_test", - srcs = [ - "f32-gavgpool-minmax.cc", - "gavgpool-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "f32_gemm_test", srcs = [ @@ -1021,24 +1003,6 @@ xnnpack_unit_test( ], ) -xnnpack_unit_test( - name = "qs8_gavgpool_minmax_fp32_test", - srcs = [ - "gavgpool-microkernel-tester.h", - "qs8-gavgpool-minmax-fp32.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "qs8_gavgpool_minmax_rndnu_test", - srcs = [ - "gavgpool-microkernel-tester.h", - "qs8-gavgpool-minmax-rndnu.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( - name = "qs8_rdsum_minmax_fp32_test", - srcs = [ @@ -1073,24 +1037,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) -xnnpack_unit_test( - name = "qu8_gavgpool_minmax_fp32_test", - srcs = [ - "gavgpool-microkernel-tester.h", - "qu8-gavgpool-minmax-fp32.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - -xnnpack_unit_test( - name = "qu8_gavgpool_minmax_rndnu_test", - srcs = [ - "gavgpool-microkernel-tester.h", - "qu8-gavgpool-minmax-rndnu.cc", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "qu8_gemm_minmax_fp32_test", srcs = [ diff --git a/test/average-pooling-nhwc.cc b/test/average-pooling-nhwc.cc index 2e8c72ea83d..7ce8544d152 100644 --- a/test/average-pooling-nhwc.cc +++ b/test/average-pooling-nhwc.cc @@ -887,9 +887,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_multithreaded) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config*
avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride_width = 1; stride_width <= 2; stride_width++) { for (size_t stride_height = 1; stride_height <= 2; stride_height++) { @@ -921,9 +921,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_left = 0; padding_left <= 1; padding_left++) { @@ -955,9 +955,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_top = 0; padding_top <= 1; padding_top++) { @@ -989,9 +989,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_tf_same_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t input_height = pooling_size.first + 3; input_height <= pooling_size.first + 4; input_height++) { @@ -1043,9 +1043,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_tf_same_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -1067,9 +1067,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); -
ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -1091,9 +1091,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_input_scale) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (float input_scale = 0.01f; input_scale < 100.0f; input_scale *= 3.14159265f) { AveragePoolingOperatorTester() @@ -1117,9 +1117,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_input_scale) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_input_zero_point) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) { AveragePoolingOperatorTester() @@ -1143,9 +1143,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_input_zero_point) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_output_scale) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (float output_scale = 0.01f; output_scale < 100.0f; output_scale *= 3.14159265f) { AveragePoolingOperatorTester() @@ -1169,9 +1169,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_output_scale) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_output_zero_point) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (int32_t output_zero_point = 0; output_zero_point <= 255; output_zero_point += 51) { AveragePoolingOperatorTester() @@ -1195,9 +1195,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_qmin) { -
const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -1219,9 +1219,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_QU8, large_pool_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -1293,9 +1293,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride_width = 1; stride_width <= 2; stride_width++) { for (size_t stride_height = 1; stride_height <= 2; stride_height++) { @@ -1329,9 +1329,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_left = 0; padding_left <= 1; padding_left++) { @@ -1365,9 +1365,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_top = 0; padding_top <= 1; padding_top++) { @@ -1401,9 +1401,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_height_padding) { }
TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_tf_same_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t input_height = pooling_size.first + 3; input_height <= pooling_size.first + 4; input_height++) { @@ -1459,9 +1459,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_tf_same_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -1485,9 +1485,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -1511,9 +1511,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_input_scale) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (float input_scale = 0.01f; input_scale < 100.0f; input_scale *= 3.14159265f) { AveragePoolingOperatorTester() @@ -1539,9 +1539,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_input_scale) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_input_zero_point) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {
AveragePoolingOperatorTester() @@ -1567,9 +1567,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_input_zero_point) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_output_scale) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (float output_scale = 0.01f; output_scale < 100.0f; output_scale *= 3.14159265f) { AveragePoolingOperatorTester() @@ -1595,9 +1595,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_output_scale) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_output_zero_point) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (int32_t output_zero_point = 0; output_zero_point <= 255; output_zero_point += 51) { AveragePoolingOperatorTester() @@ -1623,9 +1623,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_output_zero_point) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -1649,9 +1649,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -1692,9 +1692,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_pool_multithreaded) { /**************************** GAVGPOOL path, unipass ****************************/ TEST(AVERAGE_POOLING_NHWC_QU8, small_image) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100;
channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -1714,9 +1714,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image) { } TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { /* With left padding */ AveragePoolingOperatorTester() @@ -1757,9 +1757,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { /* With top padding */ AveragePoolingOperatorTester() @@ -1800,9 +1800,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -1824,9 +1824,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -1848,9 +1848,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_input_scale) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { for (float input_scale = 0.01f; input_scale < 100.0f; input_scale *= 3.14159265f) { AveragePoolingOperatorTester() @@ -1874,9 +1874,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8,
@@ -1874,9 +1874,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_input_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_input_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -1900,9 +1900,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_input_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_output_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float output_scale = 0.01f; output_scale < 100.0f; output_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -1926,9 +1926,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_output_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_output_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t output_zero_point = 0; output_zero_point <= 255; output_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -1952,9 +1952,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_output_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -1976,9 +1976,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -2002,9 +2002,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, small_image_with_qmax) {
 /**************************** GAVGPOOL path, unipass, batched ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2026,9 +2026,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     /* With left padding */
     AveragePoolingOperatorTester()
@@ -2073,9 +2073,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     /* With top padding */
     AveragePoolingOperatorTester()
@@ -2120,9 +2120,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2146,9 +2146,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2172,9 +2172,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_input_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float input_scale = 0.01f; input_scale < 100.0f; input_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -2200,9 +2200,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_input_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_input_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -2228,9 +2228,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_input_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_output_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float output_scale = 0.01f; output_scale < 100.0f; output_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -2256,9 +2256,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_output_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_output_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t output_zero_point = 0; output_zero_point <= 255; output_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -2284,9 +2284,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_output_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
      .batch_size(2)
@@ -2310,9 +2310,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2338,9 +2338,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_small_image_with_qmax) {
 /**************************** GAVGPOOL path, multipass ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -2360,9 +2360,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_left = 1; padding_left <= 2; padding_left++) {
       AveragePoolingOperatorTester()
@@ -2404,9 +2404,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_top = 1; padding_top <= 2; padding_top++) {
      AveragePoolingOperatorTester()
@@ -2448,9 +2448,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -2472,9 +2472,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -2496,9 +2496,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_input_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float input_scale = 0.01f; input_scale < 100.0f; input_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -2522,9 +2522,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_input_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_input_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -2548,9 +2548,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_input_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_output_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float output_scale = 0.01f; output_scale < 100.0f; output_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -2574,9 +2574,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_output_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_output_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t output_zero_point = 0; output_zero_point <= 255; output_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -2600,9 +2600,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_output_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -2624,9 +2624,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -2650,9 +2650,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, large_image_with_qmax) {
 /**************************** GAVGPOOL path, multipass, batched ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2674,9 +2674,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_left = 1; padding_left <= 2; padding_left++) {
       AveragePoolingOperatorTester()
@@ -2722,9 +2722,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_top = 1; padding_top <= 2; padding_top++) {
       AveragePoolingOperatorTester()
@@ -2770,9 +2770,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
      .batch_size(2)
@@ -2796,9 +2796,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2822,9 +2822,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_input_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float input_scale = 0.01f; input_scale < 100.0f; input_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -2850,9 +2850,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_input_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_input_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t input_zero_point = 0; input_zero_point <= 255; input_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -2878,9 +2878,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_input_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_output_scale) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (float output_scale = 0.01f; output_scale < 100.0f; output_scale *= 3.14159265f) {
       AveragePoolingOperatorTester()
@@ -2906,9 +2906,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_output_scale) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_output_zero_point) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (int32_t output_zero_point = 0; output_zero_point <= 255; output_zero_point += 51) {
       AveragePoolingOperatorTester()
@@ -2934,9 +2934,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_output_zero_point) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2960,9 +2960,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -2986,9 +2986,9 @@ TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_with_qmax) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_QU8, batched_large_image_multithreaded) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_qu8_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_qu8_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -3832,9 +3832,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_multithreaded) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride_width = 1; stride_width <= 2; stride_width++) {
       for (size_t stride_height = 1; stride_height <= 2; stride_height++) {
@@ -3866,9 +3866,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
       for (size_t padding_left = 1; padding_left <= 2; padding_left++) {
@@ -3916,9 +3916,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
       for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
@@ -3950,9 +3950,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_tf_same_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
       for (size_t input_height = pooling_size.first + 3; input_height <= pooling_size.first + 4; input_height++) {
@@ -4004,9 +4004,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_tf_same_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first + 3)
@@ -4028,9 +4028,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first + 3)
@@ -4052,9 +4052,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first + 3)
@@ -4076,9 +4076,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_pool_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first + 3)
@@ -4154,9 +4154,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride_width = 1; stride_width <= 2; stride_width++) {
       for (size_t stride_height = 1; stride_height <= 2; stride_height++) {
@@ -4190,9 +4190,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
      for (size_t padding_left = 0; padding_left <= 1; padding_left++) {
@@ -4226,9 +4226,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
       for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
@@ -4262,9 +4262,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_tf_same_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
       for (size_t input_height = pooling_size.first + 3; input_height <= pooling_size.first + 4; input_height++) {
@@ -4320,9 +4320,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_tf_same_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4346,9 +4346,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4372,9 +4372,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4398,9 +4398,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4443,9 +4443,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_pool_multithreaded) {
 /**************************** GAVGPOOL path, unipass ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -4465,9 +4465,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     /* With left padding */
     AveragePoolingOperatorTester()
@@ -4508,9 +4508,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     /* With top padding */
     AveragePoolingOperatorTester()
@@ -4551,9 +4551,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -4575,9 +4575,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -4599,9 +4599,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -4623,9 +4623,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -4649,9 +4649,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, small_image_with_qmax) {
 /**************************** GAVGPOOL path, unipass, batched ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4673,9 +4673,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     /* With left padding */
     AveragePoolingOperatorTester()
@@ -4720,9 +4720,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     /* With top padding */
     AveragePoolingOperatorTester()
@@ -4767,9 +4767,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4793,9 +4793,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4819,9 +4819,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4845,9 +4845,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -4873,9 +4873,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_small_image_with_qmax) {
 /**************************** GAVGPOOL path, multipass ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -4895,9 +4895,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_left = 1; padding_left <= 2; padding_left++) {
       AveragePoolingOperatorTester()
@@ -4939,9 +4939,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_top = 1; padding_top <= 2; padding_top++) {
       AveragePoolingOperatorTester()
@@ -4983,9 +4983,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -5007,9 +5007,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -5031,9 +5031,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -5055,9 +5055,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .input_height(pooling_size.first)
@@ -5081,9 +5081,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, large_image_with_qmax) {
 /**************************** GAVGPOOL path, multipass, batched ****************************/
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -5105,9 +5105,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_left = 1; padding_left <= 2; padding_left++) {
       AveragePoolingOperatorTester()
@@ -5153,9 +5153,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_width_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_height_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
       AveragePoolingOperatorTester()
@@ -5201,9 +5201,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_height_padding) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_input_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -5227,9 +5227,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_input_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_output_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -5253,9 +5253,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_output_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_qmin) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -5279,9 +5279,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_qmin) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_qmax) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -5305,9 +5305,9 @@ TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_with_qmax) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F16, batched_large_image_multithreaded) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     AveragePoolingOperatorTester()
       .batch_size(2)
@@ -6108,9 +6108,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_multithreaded) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_stride) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride_width = 1; stride_width <= 2; stride_width++) {
       for (size_t stride_height = 1; stride_height <= 2; stride_height++) {
@@ -6142,9 +6142,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_stride) {
 }
 
 TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_width_padding) {
-  const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config();
-  ASSERT_NE(gavgpool_config, nullptr);
-  const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2);
+  const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config();
+  ASSERT_NE(avgpool_config, nullptr);
+  const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2);
   for (size_t channels = 1; channels <= 100; channels += 15) {
     for (size_t stride = 1; stride <= 2; stride++) {
       for (size_t padding_left = 1; padding_left <= 2; padding_left++) {
xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_left = 1; padding_left <= 2; padding_left++) { @@ -6192,9 +6192,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_top = 0; padding_top <= 1; padding_top++) { @@ -6226,9 +6226,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_tf_same_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t input_height = pooling_size.first + 3; input_height <= pooling_size.first + 4; input_height++) { @@ -6280,9 +6280,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_tf_same_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -6304,9 +6304,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -6328,9 +6328,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> 
pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -6352,9 +6352,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_F32, large_pool_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first + 3) @@ -6426,9 +6426,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride_width = 1; stride_width <= 2; stride_width++) { for (size_t stride_height = 1; stride_height <= 2; stride_height++) { @@ -6462,9 +6462,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_left = 0; padding_left <= 1; padding_left++) { @@ -6498,9 +6498,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t padding_top = 0; padding_top <= 1; padding_top++) { @@ -6534,9 +6534,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_tf_same_padding) { - const struct xnn_gavgpool_config* gavgpool_config = 
xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t stride = 1; stride <= 2; stride++) { for (size_t input_height = pooling_size.first + 3; input_height <= pooling_size.first + 4; input_height++) { @@ -6592,9 +6592,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_tf_same_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -6618,9 +6618,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -6644,9 +6644,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -6670,9 +6670,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -6713,9 +6713,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_pool_multithreaded) { /**************************** GAVGPOOL path, unipass ****************************/ TEST(AVERAGE_POOLING_NHWC_F32, small_image) { - const struct xnn_gavgpool_config* gavgpool_config = 
xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -6735,9 +6735,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image) { } TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { /* With left padding */ AveragePoolingOperatorTester() @@ -6778,9 +6778,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { /* With top padding */ AveragePoolingOperatorTester() @@ -6821,9 +6821,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -6845,9 +6845,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -6869,9 +6869,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = 
xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -6893,9 +6893,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -6919,9 +6919,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, small_image_with_qmax) { /**************************** GAVGPOOL path, unipass, batched ****************************/ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -6943,9 +6943,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { /* With left padding */ AveragePoolingOperatorTester() @@ -6990,9 +6990,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { /* With top padding */ AveragePoolingOperatorTester() @@ -7037,9 +7037,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = 
SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7063,9 +7063,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7089,9 +7089,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7115,9 +7115,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = SmallPoolSize(gavgpool_config->row_tile); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = SmallPoolSize(avgpool_config->primary_tile); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7143,9 +7143,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_small_image_with_qmax) { /**************************** GAVGPOOL path, multipass ****************************/ TEST(AVERAGE_POOLING_NHWC_F32, large_image) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -7165,9 +7165,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_image) { } TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t padding_left = 1; padding_left <= 2; 
padding_left++) { AveragePoolingOperatorTester() @@ -7209,9 +7209,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t padding_top = 1; padding_top <= 2; padding_top++) { AveragePoolingOperatorTester() @@ -7253,9 +7253,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -7277,9 +7277,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -7301,9 +7301,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -7325,9 +7325,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_F32, large_image_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .input_height(pooling_size.first) @@ -7351,9 +7351,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, 
large_image_with_qmax) { /**************************** GAVGPOOL path, multipass, batched ****************************/ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7375,9 +7375,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_width_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t padding_left = 1; padding_left <= 2; padding_left++) { AveragePoolingOperatorTester() @@ -7423,9 +7423,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_width_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_height_padding) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { for (size_t padding_top = 0; padding_top <= 1; padding_top++) { AveragePoolingOperatorTester() @@ -7471,9 +7471,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_height_padding) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_input_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7497,9 +7497,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_input_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_output_stride) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7523,9 +7523,9 @@ 
TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_output_stride) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_qmin) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7549,9 +7549,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_qmin) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_qmax) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) @@ -7575,9 +7575,9 @@ TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_with_qmax) { } TEST(AVERAGE_POOLING_NHWC_F32, batched_large_image_multithreaded) { - const struct xnn_gavgpool_config* gavgpool_config = xnn_init_f32_gavgpool_config(); - ASSERT_NE(gavgpool_config, nullptr); - const std::pair<size_t, size_t> pooling_size = LargePoolSize(gavgpool_config->row_tile * 2); + const struct xnn_avgpool_config* avgpool_config = xnn_init_f32_avgpool_config(); + ASSERT_NE(avgpool_config, nullptr); + const std::pair<size_t, size_t> pooling_size = LargePoolSize(avgpool_config->primary_tile * 2); for (size_t channels = 1; channels <= 100; channels += 15) { AveragePoolingOperatorTester() .batch_size(2) diff --git a/test/f16-gavgpool-minmax.cc b/test/f16-gavgpool-minmax.cc deleted file mode 100644 index 30c5f957041..00000000000 --- a/test/f16-gavgpool-minmax.cc +++ /dev/null @@ -1,3403 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/f16-gavgpool-minmax.yaml -// Generator: tools/generate-gavgpool-test.py - - -#include <gtest/gtest.h> -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "gavgpool-microkernel-tester.h" - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; 
channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - 
TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, 
channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; 
channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; 
channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, 
xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - 
TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, 
channels_div_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - 
.Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_eq_8_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - 
TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_div_8_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_div_8_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_lt_8_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_gt_8_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - 
.Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_div_16_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_div_16_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - 
GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_div_24_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_div_24_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, 
channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_div_32_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_div_32_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, 
channels_lt_32_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_subtile) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_FP16_ARITH; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - 
TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, 
channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t 
channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - 
.channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, 
xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - 
.channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24, 
xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_div_32_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; 
rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - 
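The rows values swept in these cases trace the 7p7x row tiling: the kernel consumes 7 rows in a first pass and up to 7 more per subsequent pass, so rows == 14 is the smallest full two-pass tile, 8..13 leaves a partial second pass (the subtile cases), and 14..35 in steps of 7 walks the multipass path. A minimal sketch of that pass accounting, with a hypothetical helper name rather than anything from XNNPACK:

#include <cassert>
#include <cstddef>

// Illustrative pass count for a 7p7x multipass kernel (helper name is
// hypothetical): the first pass covers 7 rows and every subsequent pass
// covers up to 7 more, so rows in [8, 14] both take exactly two passes.
size_t gavgpool_7p7x_pass_count(size_t rows) {
  assert(rows > 7);  // rows <= 7 is served by the unipass (7x) kernels below
  return 1 + (rows - 1) / 7;
}

Under that model the fulltile and subtile cases take the same number of passes and differ only in whether the last pass is full, which is why the tests distinguish them by row count.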
TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7P7X__F16C_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_eq_8_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_div_8_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_div_8_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_lt_8_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_gt_8_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_eq_16_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - 
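The channels_lt/div/gt buckets that surround each _cN variant probe the kernel's channel tiling: fewer channels than one tile, an exact multiple of the tile, and a tile plus remainder. A rough sketch of the loop shape being exercised, under the assumption of a full-tile fast path plus a partial tail (illustrative, not the actual kernel):

#include <cstddef>

// Illustrative channel-loop shape behind the _c8/_c16/_c24/_c32 suffixes
// (not the actual kernel): full tiles first, then a partial tail, which is
// the path that the channels_lt_* and channels_gt_* cases drive.
void for_each_channel_tile(size_t channels, size_t tile,
                           void (*full_tile)(size_t c),
                           void (*partial_tile)(size_t c, size_t n)) {
  size_t c = 0;
  for (; c + tile <= channels; c += tile) {
    full_tile(c);  // channels_eq_* / channels_div_* stay on this path
  }
  if (c < channels) {
    partial_tile(c, channels - c);  // channels_lt_* / channels_gt_* end here
  }
}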
TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_div_16_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_div_16_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_lt_16_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_gt_16_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, 
xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_eq_24_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_div_24_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_div_24_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_lt_24_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, 
xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_gt_24_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_eq_32_fulltile) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_eq_32_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_eq_32_fulltile_with_input_stride) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_eq_32_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_eq_32_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, 
xnn_init_f16_scaleminmax_scalar_params); - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_div_32_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_div_32_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_lt_32_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_lt_32_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_lt_32_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_lt_32_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_gt_32_fulltile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_gt_32_subtile) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_gt_32_fulltile_with_qmax) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } - - TEST(F16_GAVGPOOL_MINMAX_7X__F16C_C32, channels_gt_32_fulltile_with_qmin) { - TEST_REQUIRES_X86_F16C; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32, xnn_init_f16_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git 
a/test/f16-gavgpool-minmax.yaml b/test/f16-gavgpool-minmax.yaml
deleted file mode 100644
index 543b7dbf1f4..00000000000
--- a/test/f16-gavgpool-minmax.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2020 Google LLC
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# ARM NEON+FP16ARITH
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32
-  init: xnn_init_f16_scaleminmax_scalar_params
-
-# x86 F16C
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c16
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c24
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c32
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c16
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c24
-  init: xnn_init_f16_scaleminmax_scalar_params
-- name: xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c32
-  init: xnn_init_f16_scaleminmax_scalar_params
diff --git a/test/f32-gavgpool-minmax.cc b/test/f32-gavgpool-minmax.cc
deleted file mode 100644
index 5fbd052c14a..00000000000
--- a/test/f32-gavgpool-minmax.cc
+++ /dev/null
@@ -1,3285 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-// Specification: test/f32-gavgpool-minmax.yaml
-// Generator: tools/generate-gavgpool-test.py
-
-
-#include <gtest/gtest.h>
-#include "xnnpack/common.h"
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/isa-checks.h"
-#include "xnnpack/microparams-init.h"
-#include "gavgpool-microkernel-tester.h"
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(4)
-      .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(4)
-      .input_stride(7)
-      .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(4)
-      .qmax(128)
-      .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(4)
-      .qmin(128)
-      .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(4)
-        .input_stride(7)
-        .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_eq_4_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(4)
-        .input_stride(7)
-        .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_div_4_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 8; channels < 32; channels += 4) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_div_4_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 8; channels < 32; channels += 4) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4,
channels_div_4_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_div_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_lt_4_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_lt_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_lt_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_lt_4_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_lt_4_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_gt_4_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_gt_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - 
.qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_gt_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_gt_4_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_gt_4_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__NEON_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_2pass_fulltile) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_2pass_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, 
channels_eq_4_multipass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_div_4_2pass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_div_4_2pass_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_div_4_multipass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_div_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_lt_4_2pass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_lt_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_lt_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_lt_4_2pass_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - 
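Every variant in this file is checked against the same scalar semantics: per channel, sum rows inputs spaced input_stride elements apart, scale by the reciprocal of rows, and clamp to the configured min/max. A minimal sketch of that reference computation, with illustrative names rather than the tester's actual internals:

#include <algorithm>
#include <cstddef>

// Minimal scalar reference for the behavior under test (names are
// illustrative): per channel, sum `rows` inputs spaced `input_stride`
// floats apart, scale by 1/rows, and clamp to [output_min, output_max].
void gavgpool_minmax_reference(size_t rows, size_t channels,
                               size_t input_stride, const float* input,
                               float* output, float output_min,
                               float output_max) {
  const float scale = 1.0f / static_cast<float>(rows);
  for (size_t c = 0; c < channels; c++) {
    float acc = 0.0f;
    for (size_t r = 0; r < rows; r++) {
      acc += input[r * input_stride + c];  // requires input_stride >= channels
    }
    output[c] = std::min(std::max(acc * scale, output_min), output_max);
  }
}

The input_stride cases change only the row pitch, not the arithmetic, which is why each pairs with an otherwise identical test.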
TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_lt_4_multipass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_gt_4_2pass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_gt_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_gt_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_gt_4_2pass_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_gt_4_multipass_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__SSE_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - 
.Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 
4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; 
rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_ARM_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; 
channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - 
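The _arm and _x86 suffixes denote two WAsm SIMD lowerings of the same kernel that differ chiefly in the min/max clamp: the arm flavor can use the direct f32x4 min/max, while the x86 flavor favors compare-and-select, which tends to lower to cheaper SSE instructions. A sketch of the two clamp styles (illustrative; the shipped kernels differ in detail):

#include <wasm_simd128.h>

// Two ways to clamp an f32x4 value to [lo, hi], lane-wise; illustrative only.
// Direct min/max, the style the _arm variants favor:
v128_t clamp_minmax(v128_t x, v128_t lo, v128_t hi) {
  return wasm_f32x4_max(wasm_f32x4_min(x, hi), lo);
}

// Compare-and-select, the style the _x86 variants favor because it lowers
// to cheap compare/blend instructions on SSE:
v128_t clamp_bitselect(v128_t x, v128_t lo, v128_t hi) {
  x = wasm_v128_bitselect(x, hi, wasm_f32x4_lt(x, hi));  // min(x, hi)
  x = wasm_v128_bitselect(lo, x, wasm_f32x4_lt(x, lo));  // max(x, lo)
  return x;
}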
TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASMSIMD_X86_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, 
xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 
14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__WASM_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_2pass_subtile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_eq_1_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_div_1_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - 
TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_div_1_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_div_1_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_div_1_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_gt_1_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_gt_1_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_gt_1_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_gt_1_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_gt_1_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C1V, channels_gt_1_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - 
TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_2pass_subtile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_eq_2_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_div_2_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_div_2_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_div_2_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, 
xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_div_2_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_lt_2_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_lt_2_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_lt_2_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_lt_2_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_lt_2_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_lt_2_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_gt_2_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_gt_2_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_gt_2_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels 
< 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_gt_2_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_gt_2_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C2V, channels_gt_2_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_2pass_subtile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - 
.Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_eq_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_div_4_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_div_4_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_div_4_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_div_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_lt_4_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_lt_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_lt_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_lt_4_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_lt_4_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t 
channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_lt_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_gt_4_2pass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_gt_4_2pass_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_gt_4_2pass_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_gt_4_2pass_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_gt_4_multipass_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7P7X__RVV_C4V, channels_gt_4_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, 
channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; 
channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - -TEST(F32_GAVGPOOL_MINMAX_7P7X__SCALAR_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_eq_4_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_eq_4_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_eq_4_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_eq_4_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_eq_4_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_div_4_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_div_4_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t 
rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_lt_4_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_lt_4_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_lt_4_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_lt_4_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_gt_4_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_gt_4_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_gt_4_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__NEON_C4, channels_gt_4_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_eq_4_fulltile) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_eq_4_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - 
TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_eq_4_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_eq_4_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_eq_4_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_div_4_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_div_4_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_lt_4_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_lt_4_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_lt_4_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_lt_4_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_gt_4_fulltile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_gt_4_subtile) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - 
TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_gt_4_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__SSE_C4, channels_gt_4_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) 
{ - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_ARM_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, 
xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASMSIMD_X86_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - 
.channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__WASM_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_eq_1_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_eq_1_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_eq_1_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_eq_1_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_eq_1_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - 
.Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_gt_1_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_gt_1_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_gt_1_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C1V, channels_gt_1_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_eq_2_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_eq_2_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_eq_2_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_eq_2_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_eq_2_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_div_2_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_div_2_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - 
.channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_lt_2_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_lt_2_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_lt_2_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_lt_2_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_gt_2_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_gt_2_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_gt_2_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C2V, channels_gt_2_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_eq_4_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_eq_4_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - 
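  // The f32 `7x` suites above and below all exercise the same unipass
  // contract: average at most seven rows per channel, scale by 1/rows, and
  // clamp to [output_min, output_max]. A rough scalar sketch of that
  // contract, with an assumed simplified signature (`gavgpool_7x_reference`
  // is illustrative only, not an XNNPACK symbol; the real kernels take
  // packed scale/min/max params):
  #include <algorithm>
  #include <cassert>
  #include <cstddef>

  static void gavgpool_7x_reference(size_t rows, size_t channels,
                                    const float* input, size_t input_stride,
                                    float scale, float output_min,
                                    float output_max, float* output) {
    assert(rows != 0 && rows <= 7);  // unipass: a single tile of up to 7 rows
    for (size_t c = 0; c < channels; c++) {
      float acc = 0.0f;
      for (size_t n = 0; n < rows; n++) {
        acc += input[n * input_stride + c];  // input_stride is in elements
      }
      // The tests pass scale = 1/rows, so this is the mean of the column.
      output[c] = std::max(std::min(acc * scale, output_max), output_min);
    }
  }
  // The `7p7x` multipass variants compute the same result, but accumulate
  // seven rows at a time through an intermediate buffer.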
TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_eq_4_fulltile_with_input_stride) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_eq_4_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_eq_4_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_div_4_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_div_4_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_lt_4_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_lt_4_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_lt_4_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_lt_4_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_gt_4_fulltile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_gt_4_subtile) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, 
xnn_init_f32_scaleminmax_scalar_params); - } - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_gt_4_fulltile_with_qmax) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } - - TEST(F32_GAVGPOOL_MINMAX_7X__RVV_C4V, channels_gt_4_fulltile_with_qmin) { - TEST_REQUIRES_RISCV_VECTOR; - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v, xnn_init_f32_scaleminmax_scalar_params); - } - } -#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV - - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } - } -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} - -TEST(F32_GAVGPOOL_MINMAX_7X__SCALAR_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1, xnn_init_f32_scaleminmax_scalar_params); - } -} \ No newline at end of file diff --git a/test/f32-gavgpool-minmax.yaml b/test/f32-gavgpool-minmax.yaml deleted file mode 100644 index 74747c8194c..00000000000 --- a/test/f32-gavgpool-minmax.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2020 
Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -# -# ARM NEON -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4 - init: xnn_init_f32_scaleminmax_scalar_params -# x86 SSE -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4 - init: xnn_init_f32_scaleminmax_scalar_params -# WAsm SIMD -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4 - init: xnn_init_f32_scaleminmax_scalar_params -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4 - init: xnn_init_f32_scaleminmax_scalar_params -# WAsm -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1 - init: xnn_init_f32_scaleminmax_scalar_params -# RISC-V Vector -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c1v - init: xnn_init_f32_scaleminmax_scalar_params -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c2v - init: xnn_init_f32_scaleminmax_scalar_params -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__rvv_c4v - init: xnn_init_f32_scaleminmax_scalar_params -# Scalar -- name: xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1 - init: xnn_init_f32_scaleminmax_scalar_params -# ARM NEON -- name: xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4 - init: xnn_init_f32_scaleminmax_scalar_params -# x86 SSE -- name: xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4 - init: xnn_init_f32_scaleminmax_scalar_params -# WAsm SIMD -- name: xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4 - init: xnn_init_f32_scaleminmax_scalar_params -- name: xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4 - init: xnn_init_f32_scaleminmax_scalar_params -# WAsm -- name: xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1 - init: xnn_init_f32_scaleminmax_scalar_params -# RISC-V Vector -- name: xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c1v - init: xnn_init_f32_scaleminmax_scalar_params -- name: xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c2v - init: xnn_init_f32_scaleminmax_scalar_params -- name: xnn_f32_gavgpool_minmax_ukernel_7x__rvv_c4v - init: xnn_init_f32_scaleminmax_scalar_params -# Scalar -- name: xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1 - init: xnn_init_f32_scaleminmax_scalar_params diff --git a/test/gavgpool-microkernel-tester.h b/test/gavgpool-microkernel-tester.h deleted file mode 100644 index 92a3ba72aef..00000000000 --- a/test/gavgpool-microkernel-tester.h +++ /dev/null @@ -1,651 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
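// The GAvgPoolMicrokernelTester below compares each kernel against scalar
// reference math. For the quantized (QU8/QS8) paths it folds the zero-point
// correction and the averaging divisor into a bias and a single
// requantization scale. A self-contained sketch of that per-channel math
// (the helper name and the plain-argument signature are assumptions for
// illustration, mirroring the init_params calls in the class below):
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static int32_t gavgpool_qu8_reference(const uint8_t* column, size_t rows,
                                      size_t stride, int32_t input_zero_point,
                                      float scale, int32_t output_zero_point,
                                      int32_t qmin, int32_t qmax) {
  // Bias handed to init_params: pre-subtracts rows * input_zero_point.
  int32_t acc = -input_zero_point * (int32_t) rows;
  for (size_t n = 0; n < rows; n++) {
    acc += (int32_t) column[n * stride];
  }
  // scale = input_scale / (output_scale * rows); the division by rows is
  // folded into the requantization step.
  const float scaled = (float) acc * scale + (float) output_zero_point;
  const int32_t rounded = (int32_t) std::lrintf(scaled);
  return std::min(std::max(rounded, qmin), qmax);
}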
- -#pragma once - -#include <algorithm> -#include <cassert> -#include <cmath> -#include <cstddef> -#include <cstdint> -#include <cstdlib> -#include <limits> -#include <random> -#include <vector> - -#include <gtest/gtest.h> -#include "xnnpack.h" -#include "xnnpack/aligned-allocator.h" -#include "xnnpack/math.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/microparams.h" -#include "xnnpack/requantization.h" -#include "replicable_random_device.h" - -class GAvgPoolMicrokernelTester { - public: - GAvgPoolMicrokernelTester& rows(size_t rows) { - assert(rows != 0); - this->rows_ = rows; - return *this; - } - - size_t rows() const { - return this->rows_; - } - - GAvgPoolMicrokernelTester& channels(size_t channels) { - assert(channels != 0); - this->channels_ = channels; - return *this; - } - - size_t channels() const { - return this->channels_; - } - - GAvgPoolMicrokernelTester& channel_tile(size_t channel_tile) { - assert(channel_tile != 0); - this->channel_tile_ = channel_tile; - return *this; - } - - size_t channel_tile() const { - return this->channel_tile_; - } - - GAvgPoolMicrokernelTester& input_stride(size_t input_stride) { - assert(input_stride != 0); - this->input_stride_ = input_stride; - return *this; - } - - size_t input_stride() const { - if (this->input_stride_ == 0) { - return channels(); - } else { - assert(this->input_stride_ >= channels()); - return this->input_stride_; - } - } - - GAvgPoolMicrokernelTester& input_scale(float input_scale) { - assert(input_scale > 0.0f); - assert(std::isnormal(input_scale)); - this->input_scale_ = input_scale; - return *this; - } - - float input_scale() const { - return this->input_scale_; - } - - GAvgPoolMicrokernelTester& input_zero_point(uint8_t input_zero_point) { - this->input_zero_point_ = input_zero_point; - return *this; - } - - uint8_t input_zero_point() const { - return this->input_zero_point_; - } - - GAvgPoolMicrokernelTester& output_scale(float output_scale) { - assert(output_scale > 0.0f); - assert(std::isnormal(output_scale)); - this->output_scale_ = output_scale; - return *this; - } - - float output_scale() const { - return this->output_scale_; - } - - GAvgPoolMicrokernelTester& output_zero_point(uint8_t output_zero_point) { - this->output_zero_point_ = output_zero_point; - return *this; - } - - uint8_t output_zero_point() const { - return this->output_zero_point_; - } - - GAvgPoolMicrokernelTester& qmin(uint8_t qmin) { - this->qmin_ = qmin; - return *this; - } - - uint8_t qmin() const { - return this->qmin_; - } - - GAvgPoolMicrokernelTester& qmax(uint8_t qmax) { - this->qmax_ = qmax; - return *this; - } - - uint8_t qmax() const { - return this->qmax_; - } - - GAvgPoolMicrokernelTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void Test( - xnn_qu8_gavgpool_minmax_unipass_ukernel_fn gavgpool_minmax, - xnn_init_qu8_avgpool_minmax_params_fn init_params, - xnn_qu8_requantize_fn requantize) const - { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution<int32_t> u8dist( - std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()); - - std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + - (rows() - 1) * input_stride() + channels()); - std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - std::vector<uint8_t> output(channels()); - std::vector<uint8_t> output_ref(channels()); - std::vector<float> output_fp(channels()); - std::vector<int32_t> accumulators(channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - // Prepare parameters. - union xnn_qu8_avgpool_minmax_params params; - init_params( - &params, - -int32_t(input_zero_point()) * int32_t(rows()), - input_scale() / (output_scale() * float(rows())), - output_zero_point(), qmin(), qmax()); - - // Compute reference results. - for (size_t c = 0; c < channels(); c++) { - int32_t acc = 0; - for (size_t n = 0; n < rows(); n++) { - acc += int32_t(input[n * input_stride() + c]) - int32_t(input_zero_point()); - } - accumulators[c] = acc; - output_ref[c] = requantize( - acc, input_scale() / (output_scale() * float(rows())), output_zero_point(), qmin(), qmax()); - output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point()); - output_fp[c] = std::min(output_fp[c], float(qmax())); - output_fp[c] = std::max(output_fp[c], float(qmin())); - } - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(uint8_t), - zero.data(), - output.data(), - &params); - - // Verify results. - for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(uint32_t(output[c]), uint32_t(qmax())) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_GE(uint32_t(output[c]), uint32_t(qmin())) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.55f) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels() - << ", acc = " << accumulators[c]; - EXPECT_EQ(uint32_t(output_ref[c]), uint32_t(output[c])) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels() - << ", acc = " << accumulators[c]; - } - } - } - - void Test( - xnn_qu8_gavgpool_minmax_multipass_ukernel_fn gavgpool_minmax, - xnn_init_qu8_avgpool_minmax_params_fn init_params, - xnn_qu8_requantize_fn requantize) const - { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution<int32_t> u8dist( - std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()); - - std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + - (rows() - 1) * input_stride() + channels()); - std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t)); - std::vector<uint8_t> output(channels()); - std::vector<uint8_t> output_ref(channels()); - std::vector<float> output_fp(channels()); - std::vector<int32_t> accumulators(channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - // Prepare parameters. - union xnn_qu8_avgpool_minmax_params params; - init_params( - &params, - -int32_t(input_zero_point()) * int32_t(rows()), - input_scale() / (output_scale() * float(rows())), - output_zero_point(), qmin(), qmax()); - - // Compute reference results. - for (size_t c = 0; c < channels(); c++) { - int32_t acc = 0; - for (size_t n = 0; n < rows(); n++) { - acc += int32_t(input[n * input_stride() + c]) - int32_t(input_zero_point()); - } - - accumulators[c] = acc; - output_ref[c] = requantize( - acc, input_scale() / (output_scale() * float(rows())), output_zero_point(), qmin(), qmax()); - output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point()); - output_fp[c] = std::min(output_fp[c], float(qmax())); - output_fp[c] = std::max(output_fp[c], float(qmin())); - } - - // Call optimized micro-kernel.
- gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(uint8_t), - zero.data(), - buffer.data(), - output.data(), - &params); - - // Verify results. - for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(uint32_t(output[c]), uint32_t(qmax())) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_GE(uint32_t(output[c]), uint32_t(qmin())) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.55f) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels() - << ", acc = " << accumulators[c]; - EXPECT_EQ(uint32_t(output_ref[c]), uint32_t(output[c])) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels() - << ", acc = " << accumulators[c]; - } - } - } - - void Test( - xnn_qs8_gavgpool_minmax_unipass_ukernel_fn gavgpool_minmax, - xnn_init_qs8_avgpool_minmax_params_fn init_params, - xnn_qs8_requantize_fn requantize) const - { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution<int32_t> i8dist( - std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()); - - std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + - (rows() - 1) * input_stride() + channels()); - std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t)); - std::vector<int8_t> output(channels()); - std::vector<int8_t> output_ref(channels()); - std::vector<float> output_fp(channels()); - std::vector<int32_t> accumulators(channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - // Prepare parameters. - union xnn_qs8_avgpool_minmax_params params; - init_params( - &params, - -int32_t(input_zero_point() - 0x80) * int32_t(rows()), - input_scale() / (output_scale() * float(rows())), - int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80)); - - // Compute reference results. - for (size_t c = 0; c < channels(); c++) { - int32_t acc = 0; - for (size_t n = 0; n < rows(); n++) { - acc += int32_t(input[n * input_stride() + c]) - int32_t(input_zero_point() - 0x80); - } - accumulators[c] = acc; - output_ref[c] = requantize( - acc, input_scale() / (output_scale() * float(rows())), int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80)); - output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point() - 0x80); - output_fp[c] = std::min(output_fp[c], float(qmax() - 0x80)); - output_fp[c] = std::max(output_fp[c], float(qmin() - 0x80)); - } - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(int8_t), - zero.data(), - output.data(), - &params); - - // Verify results.
- for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(int32_t(output[c]), int32_t(qmax() - 0x80)) - << "at channel " << c << " / " << channels() << ", rows = " << rows(); - ASSERT_GE(int32_t(output[c]), int32_t(qmin() - 0x80)) - << "at channel " << c << " / " << channels() << ", rows = " << rows(); - ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.55f) - << "at channel " << c << " / " << channels() << ", rows = " << rows() - << ", accumulator = " << accumulators[c]; - EXPECT_EQ(int32_t(output_ref[c]), int32_t(output[c])) - << "at channel " << c << " / " << channels() << ", rows = " << rows() - << ", accumulator = " << accumulators[c]; - } - } - } - - void Test( - xnn_qs8_gavgpool_minmax_multipass_ukernel_fn gavgpool_minmax, - xnn_init_qs8_avgpool_minmax_params_fn init_params, - xnn_qs8_requantize_fn requantize) const - { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution<int32_t> i8dist( - std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()); - - std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + - (rows() - 1) * input_stride() + channels()); - std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(int8_t)); - std::vector<int8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(int8_t)); - std::vector<int8_t> output(channels()); - std::vector<int8_t> output_ref(channels()); - std::vector<float> output_fp(channels()); - std::vector<int32_t> accumulators(channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); - - // Prepare parameters. - union xnn_qs8_avgpool_minmax_params params; - init_params( - &params, - -int32_t(input_zero_point() - 0x80) * int32_t(rows()), - input_scale() / (output_scale() * float(rows())), - int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80)); - - // Compute reference results. - for (size_t c = 0; c < channels(); c++) { - int32_t acc = 0; - for (size_t n = 0; n < rows(); n++) { - acc += int32_t(input[n * input_stride() + c]) - int32_t(input_zero_point() - 0x80); - } - accumulators[c] = acc; - output_ref[c] = requantize( - acc, input_scale() / (output_scale() * float(rows())), int8_t(output_zero_point() - 0x80), int8_t(qmin() - 0x80), int8_t(qmax() - 0x80)); - output_fp[c] = float(acc) * (input_scale() / (output_scale() * float(rows()))) + float(output_zero_point() - 0x80); - output_fp[c] = std::min(output_fp[c], float(qmax() - 0x80)); - output_fp[c] = std::max(output_fp[c], float(qmin() - 0x80)); - } - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(int8_t), - zero.data(), - buffer.data(), - output.data(), - &params); - - // Verify results.
- for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(int32_t(output[c]), int32_t(qmax() - 0x80)) - << "at channel " << c << " / " << channels() << ", rows = " << rows(); - ASSERT_GE(int32_t(output[c]), int32_t(qmin() - 0x80)) - << "at channel " << c << " / " << channels() << ", rows = " << rows(); - ASSERT_NEAR(float(int32_t(output[c])), output_fp[c], 0.55f) - << "at channel " << c << " / " << channels() << ", rows = " << rows() - << ", accumulator = " << accumulators[c]; - EXPECT_EQ(int32_t(output_ref[c]), int32_t(output[c])) - << "at channel " << c << " / " << channels() << ", rows = " << rows() - << ", accumulator = " << accumulators[c]; - } - } - } - - void Test(xnn_f16_gavgpool_minmax_unipass_ukernel_fn gavgpool_minmax, xnn_init_f16_scaleminmax_params_fn init_params) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution<float> f32dist; - - std::vector<xnn_float16> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - std::vector<xnn_float16> zero(channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - std::vector<xnn_float16> output(channels()); - std::vector<float> output_ref(channels()); - - std::fill(zero.begin(), zero.end(), 0); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results, without clamping. - for (size_t c = 0; c < channels(); c++) { - float acc = 0.0f; - for (size_t n = 0; n < rows(); n++) { - acc += input[n * input_stride() + c]; - } - output_ref[c] = acc / float(rows()); - } - - // Compute clamping parameters. - const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - const float output_min = xnn_float16(accumulated_min + float(qmin()) / 255.0f * accumulated_range); - const float output_max = xnn_float16(accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range); - - // Clamp reference results. - for (float& output_values : output_ref) { - output_values = std::max(std::min(output_values, output_max), output_min); - } - - // Prepare parameters. - xnn_f16_scaleminmax_params params; - init_params(&params, - 1.0f / float(rows()), - output_min, - output_max); - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(xnn_float16), - zero.data(), - output.data(), - &params); - - // Verify results.
- for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(output[c], output_max) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_GE(output[c], output_min) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - EXPECT_NEAR(output[c], output_ref[c], std::max(1.0e-4f, std::abs(output_ref[c]) * 1.0e-2f)) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - } - } - } - - void Test(xnn_f16_gavgpool_minmax_multipass_ukernel_fn gavgpool_minmax, xnn_init_f16_scaleminmax_params_fn init_params) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution<float> f32dist; - - std::vector<xnn_float16> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - std::vector<xnn_float16, AlignedAllocator<xnn_float16, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - std::vector<xnn_float16> zero(channels() + XNN_EXTRA_BYTES / sizeof(xnn_float16)); - std::vector<xnn_float16> output(channels()); - std::vector<float> output_ref(channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results, without clamping. - for (size_t c = 0; c < channels(); c++) { - float acc = 0.0f; - for (size_t n = 0; n < rows(); n++) { - acc += input[n * input_stride() + c]; - } - output_ref[c] = acc / float(rows()); - } - - // Compute clamping parameters. - const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - const float output_min = xnn_float16(accumulated_min + float(qmin()) / 255.0f * accumulated_range); - const float output_max = xnn_float16(accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range); - - // Prepare parameters. - xnn_f16_scaleminmax_params params; - init_params(&params, - 1.0f / float(rows()), - output_min, - output_max); - - // Clamp reference results. - for (float& output_values : output_ref) { - output_values = std::max(std::min(output_values, output_max), output_min); - } - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(xnn_float16), - zero.data(), - buffer.data(), - output.data(), - &params); - - // Verify results. - for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(output[c], output_max) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_GE(output[c], output_min) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - EXPECT_NEAR(output[c], output_ref[c], std::abs(output_ref[c]) * 1.0e-0f) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - } - } - } - - void Test(xnn_f32_gavgpool_minmax_unipass_ukernel_fn gavgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution<float> f32dist; - - std::vector<float> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float)); - std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float)); - std::vector<float> output(channels()); - std::vector<float> output_ref(channels()); - - std::fill(zero.begin(), zero.end(), 0.0f); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - - // Compute reference results, without clamping.
- for (size_t c = 0; c < channels(); c++) { - float acc = 0.0f; - for (size_t n = 0; n < rows(); n++) { - acc += input[n * input_stride() + c]; - } - output_ref[c] = acc / float(rows()); - } - - // Compute clamping parameters. - const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range; - const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range; - - // Clamp reference results. - for (float& output_values : output_ref) { - output_values = std::max(std::min(output_values, output_max), output_min); - } - - // Prepare parameters. - struct xnn_f32_scaleminmax_params params; - init_params(&params, 1.0f / float(rows()), output_min, output_max); - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(float), - zero.data(), - output.data(), - &params); - - // Verify results. - for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(output[c], output_max) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_GE(output[c], output_min) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - EXPECT_NEAR(output[c], output_ref[c], std::abs(output_ref[c]) * 1.0e-6f) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - } - } - } - - void Test(xnn_f32_gavgpool_minmax_multipass_ukernel_fn gavgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_real_distribution<float> f32dist; - - std::vector<float> input((rows() - 1) * input_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float)); - std::vector<float, AlignedAllocator<float, 64>> buffer(channels() + XNN_EXTRA_BYTES / sizeof(float)); - std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float)); - std::vector<float> output(channels()); - std::vector<float> output_ref(channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); - std::fill(output.begin(), output.end(), 0.f); // kernels accumulate. - // Compute reference results, without clamping. - for (size_t c = 0; c < channels(); c++) { - float acc = 0.0f; - for (size_t n = 0; n < rows(); n++) { - acc += input[n * input_stride() + c]; - } - output_ref[c] = acc / float(rows()); - } - - // Compute clamping parameters. - const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); - const float accumulated_range = accumulated_max - accumulated_min; - const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range; - const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range; - - // Prepare parameters. - struct xnn_f32_scaleminmax_params params; - init_params(&params, 1.0f / float(rows()), output_min, output_max); - - // Clamp reference results. - for (float& output_values : output_ref) { - output_values = std::max(std::min(output_values, output_max), output_min); - } - - // Call optimized micro-kernel. - gavgpool_minmax(rows(), channels(), - input.data(), input_stride() * sizeof(float), - zero.data(), - buffer.data(), - output.data(), - &params); - - // Verify results.
- for (size_t c = 0; c < channels(); c++) { - ASSERT_LE(output[c], output_max) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - ASSERT_GE(output[c], output_min) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - EXPECT_NEAR(output[c], output_ref[c], std::abs(output_ref[c]) * 1.0e-6f) - << "at position " << c << ", rows = " << rows() << ", channels = " << channels(); - } - } - } - - private: - size_t rows_{1}; - size_t channels_{1}; - size_t channel_tile_{1}; - size_t input_stride_{0}; - float input_scale_{1.25f}; - float output_scale_{0.75f}; - uint8_t input_zero_point_{121}; - uint8_t output_zero_point_{133}; - uint8_t qmin_{0}; - uint8_t qmax_{255}; - size_t iterations_{15}; -}; diff --git a/test/qs8-gavgpool-minmax-fp32.cc b/test/qs8-gavgpool-minmax-fp32.cc deleted file mode 100644 index 1d5f7609933..00000000000 --- a/test/qs8-gavgpool-minmax-fp32.cc +++ /dev/null @@ -1,10520 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/qs8-gavgpool-minmax-fp32.yaml -// Generator: tools/generate-gavgpool-test.py - - -#include <gtest/gtest.h> -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "gavgpool-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8,
xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, 
xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, 
xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_multipass_fulltile_with_input_stride) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, 
xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - 
TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - 
for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) 
- .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, 
channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile) { - 
TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, 
xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - 
- TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, 
xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.input_stride(389) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, 
channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, 
channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - 
TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_div_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_div_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - 
.channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); 
- } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_div_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_div_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_div_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_div_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows 
< 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_div_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_div_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_fp32_neon_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_div_8_fulltile) { - 
TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_div_8_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; 
- for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_div_16_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_div_16_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, 
xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, 
channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_div_24_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_div_24_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_div_32_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_div_32_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - 
TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_subtile) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON_V8; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qs8_avgpool_minmax_fp32_neonv8_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - 
TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - 
.channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_fulltile_with_qmax) { - 
TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 8; rows < 14; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_fulltile_with_qmax) { - 
TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, 
xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, 
channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 
|| XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - 
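[Editorial aside, not part of the patch: the row values in the deleted 7p7x tests follow the kernel's pass structure. A 7p7x multipass kernel accumulates the first 7 rows in a primary pass and up to 7 more rows per subsequent pass, so rows == 14 is exactly two full passes (the "2pass_fulltile" cases), rows in 8..13 leave a partial second pass ("2pass_subtile"), and rows from 14 to 35 stepped by 7 walk the multipass path one extra full pass at a time. A minimal sketch of that arithmetic, assuming a hypothetical helper name (illustration only, not XNNPACK code):]

```c++
#include <cstddef>
#include <cstdio>

// Hypothetical helper (not part of XNNPACK or this patch): number of
// accumulation passes a 7p7x multipass kernel makes over `rows` rows --
// a first pass of 7 rows, then one more pass per (up to) 7 remaining rows.
static size_t gavgpool_7p7x_passes(size_t rows) {
  return rows <= 7 ? 1 : 1 + (rows - 7 + 6) / 7;
}

int main() {
  // rows = 14     -> 2 passes, both full    ("2pass_fulltile" tests)
  // rows = 8..13  -> 2 passes, partial 2nd  ("2pass_subtile" tests)
  // rows = 21..35 -> 3+ passes              ("multipass_fulltile" tests)
  for (size_t rows : {8, 13, 14, 21, 28, 35}) {
    std::printf("rows=%2zu -> %zu passes\n", rows, gavgpool_7p7x_passes(rows));
  }
  return 0;
}
```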
TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - 
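[Editorial aside, not part of the patch: the channel loops bracket each kernel's channel tile (8 for `_c8`, 16 for `_c16`, 24 for `_c24`): `channels_lt_*` exercises only the remainder path, `channels_eq_*` exactly one tile, `channels_gt_*` one full tile plus a remainder, and `channels_div_*` the main loop over several whole tiles. The odd `input_stride` values (11, 19, 29, 47, 131, 263, ...) appear to be primes just above the channel counts under test, so row-to-row strides never coincide with the channel count and stride bugs cannot cancel out. A sketch of that partitioning with a hypothetical classifier (illustration only, not XNNPACK code):]

```c++
#include <cstddef>
#include <cstdio>

// Hypothetical classifier (not XNNPACK code): which code path a channel
// count hits in a kernel with the given channel tile, mirroring the
// lt/eq/div/gt test groups in the surrounding diff.
static const char* channel_regime(size_t channels, size_t tile) {
  if (channels < tile)      return "lt:  remainder path only";
  if (channels == tile)     return "eq:  exactly one full tile";
  if (channels % tile == 0) return "div: main loop, whole tiles";
  return "gt:  full tile(s) plus remainder";
}

int main() {
  const size_t tile = 16;  // e.g. the __sse41_c16 kernels above
  for (size_t channels : {1, 15, 16, 17, 31, 32, 48}) {
    std::printf("channels=%2zu -> %s\n", channels,
                channel_regime(channels, tile));
  }
  return 0;
}
```

[Together, the row grid and the channel grid explain why every deleted kernel carries the same battery of roughly twenty near-identical tests.]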
TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, 
xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - 
.qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, 
xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, 
xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_div_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_div_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - 
.channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, 
channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_div_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_div_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_div_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_div_24_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - 
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qs8_avgpool_minmax_fp32_sse2_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_div_8_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_div_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_div_16_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_div_16_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_div_24_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_div_24_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qs8_avgpool_minmax_fp32_sse4_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_subtile) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_subtile_with_input_stride) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_multipass_fulltile) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_multipass_fulltile_with_input_stride) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_2pass_fulltile) {
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_2pass_subtile) {
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_multipass_fulltile) {
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(131)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_fulltile) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_subtile) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_multipass_fulltile) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(11)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_fulltile) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_fulltile_with_qmax) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_fulltile_with_qmin) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_subtile) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_multipass_fulltile) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(29)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_subtile) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_subtile_with_input_stride) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .input_stride(19)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_multipass_fulltile) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_multipass_fulltile_with_input_stride) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .input_stride(19)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_2pass_fulltile) {
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_2pass_subtile) {
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_multipass_fulltile) {
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(263)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_fulltile) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_subtile) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_multipass_fulltile) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(19)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_fulltile) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_fulltile_with_qmax) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_fulltile_with_qmin) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_subtile) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_multipass_fulltile) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(47)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_subtile) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_subtile_with_input_stride) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .input_stride(29)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_multipass_fulltile) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_multipass_fulltile_with_input_stride) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .input_stride(29)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_2pass_fulltile) {
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_2pass_subtile) {
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_multipass_fulltile) {
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(389)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_fulltile) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_subtile) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_multipass_fulltile) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(29)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_fulltile) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_fulltile_with_qmax) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_fulltile_with_qmin) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_subtile) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_multipass_fulltile) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(61)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .input_stride(37)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_subtile) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_subtile_with_input_stride) {
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .input_stride(37)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_multipass_fulltile) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_multipass_fulltile_with_input_stride) {
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .input_stride(37)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_2pass_fulltile) {
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_2pass_subtile) {
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_multipass_fulltile) {
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(521)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_fulltile) {
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_subtile) {
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_multipass_fulltile) {
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(37)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_fulltile) {
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_fulltile_with_qmax) {
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_fulltile_with_qmin) {
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_subtile) {
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_multipass_fulltile) {
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_multipass_fulltile_with_input_stride) {
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(79)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_subtile) {
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_div_8_fulltile) {
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_div_8_subtile) {
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_fulltile) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_subtile) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_fulltile) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_subtile) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_fulltile_with_qmax) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_fulltile_with_qmin) {
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_subtile) {
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_div_16_fulltile) {
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_div_16_subtile) {
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_fulltile) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_subtile) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_fulltile) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_subtile) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_fulltile_with_qmax) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_fulltile_with_qmin) {
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_subtile) {
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_div_24_fulltile) {
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_div_24_subtile) {
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_fulltile) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_subtile) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_fulltile_with_qmax) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_fulltile_with_qmin) {
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_fulltile) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_subtile) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_fulltile_with_qmax) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_fulltile_with_qmin) {
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-
-
-#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_subtile) {
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-    }
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile_with_input_stride) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .input_stride(37)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile_with_qmax) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .qmax(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile_with_qmin) {
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .qmin(128)
-      .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32);
-  }
-
-  TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_div_32_fulltile) {
-    for (size_t channels = 64;
channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_div_32_subtile) { - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_fulltile) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_subtile) { - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_fulltile_with_qmax) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_fulltile_with_qmin) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_fulltile) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_subtile) { - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_fulltile_with_qmax) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_fulltile_with_qmin) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, 
xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params, xnn_qs8_requantize_fp32); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - - 
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_2pass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_2pass_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - 
} -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_multipass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_multipass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - 
.channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_multipass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - 
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - 
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, 
xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - 
.rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_multipass_fulltile) { - for 
(size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, 
xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_2pass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_2pass_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_multipass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_multipass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_multipass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - 
.input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - 
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, 
channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - 
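The _lrintf_ kernels exercised here differ from the _fmagic_ and _imagic_ variants only in how the fp32 accumulator is rounded back to int8. A hedged sketch of that step, using plain C lrintf rounding plus clamping (not the removed kernel code):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Round to nearest via lrintf, add the output zero point, then clamp to
// [qmin, qmax]; the .qmin(128)/.qmax(128) cases above tighten these bounds.
int8_t requantize_lrintf(int32_t acc, float scale, int32_t zero_point,
                         int8_t qmin, int8_t qmax) {
  const int32_t q =
      static_cast<int32_t>(std::lrintf(static_cast<float>(acc) * scale)) +
      zero_point;
  return static_cast<int8_t>(
      std::min<int32_t>(std::max<int32_t>(q, qmin), qmax));
}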
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - 
.input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_2pass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_2pass_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_multipass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_multipass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_multipass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - 
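The eq/div/lt/gt tiers in these names mirror how a c2 kernel tiles the channel dimension: whole two-channel tiles, then a one-channel remainder. An illustrative skeleton of that tiling, assuming a plain accumulator array (not the removed kernel):

#include <cstddef>
#include <cstdint>

// channels_eq_2/div_2 cases stay in the two-wide main loop;
// channels_lt_2/gt_2 cases force the scalar remainder below it.
void sum_row_c2(const int8_t* row, size_t channels, int32_t* acc) {
  size_t c = 0;
  for (; c + 2 <= channels; c += 2) {
    acc[c + 0] += row[c + 0];
    acc[c + 1] += row[c + 1];
  }
  if (c < channels) {
    acc[c] += row[c];
  }
}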
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, 
xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; 
channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - 
.input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, 
xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_div_2_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_div_2_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - 
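The _imagic_ variants replace lrintf with the magic-bias bit trick: adding 0x1.8p23f leaves the rounded integer in the low mantissa bits, so clamping and extraction can stay in the integer domain. A hedged reconstruction of the idea; the constants follow the standard trick and are not copied from the removed kernels:

#include <cstdint>
#include <cstring>

static int32_t float_bits(float f) {
  int32_t b;
  std::memcpy(&b, &f, sizeof b);
  return b;
}

int8_t requantize_imagic(int32_t acc, float scale, int32_t zero_point,
                         int8_t qmin, int8_t qmax) {
  const float magic = 12582912.0f;  // 0x1.8p23f, bit pattern 0x4B400000
  const int32_t magic_min = float_bits(magic + (float)(qmin - zero_point));
  const int32_t magic_max = float_bits(magic + (float)(qmax - zero_point));
  float fpacc = (float) acc * scale + magic;  // rounds to nearest integer
  int32_t out = float_bits(fpacc);            // integer now sits in the bits
  out = out < magic_min ? magic_min : out;    // clamp in the integer domain
  out = out > magic_max ? magic_max : out;
  return (int8_t)(out - (INT32_C(0x4B400000) - zero_point));
}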
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_div_2_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_div_2_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; 
channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - 
.channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - 
.Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_div_2_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, 
xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_div_2_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - 
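The channels_lt_2/div_2/gt_2 sweeps above bracket the c2 kernel's tile width: whole two-channel tiles plus a one-channel remainder. As a minimal sketch of the loop structure those sweeps exercise, with hypothetical names (gavgpool_sketch_c2 is not an XNNPACK symbol) and the requantization step elided:

// Sketch only: channel-tiled row averaging in the style of the c2 kernels.
#include <cstddef>
#include <cstdint>

void gavgpool_sketch_c2(size_t rows, size_t channels, const int8_t* input,
                        size_t input_stride, float* output) {
  size_t c = 0;
  for (; c + 2 <= channels; c += 2) {  // full 2-channel tiles ("div" cases)
    int32_t acc0 = 0;
    int32_t acc1 = 0;
    for (size_t r = 0; r < rows; r++) {
      acc0 += input[r * input_stride + c];
      acc1 += input[r * input_stride + c + 1];
    }
    output[c] = (float) acc0 / (float) rows;
    output[c + 1] = (float) acc1 / (float) rows;
  }
  if (c < channels) {  // 1-channel remainder ("lt"/"gt" cases)
    int32_t acc = 0;
    for (size_t r = 0; r < rows; r++) {
      acc += input[r * input_stride + c];
    }
    output[c] = (float) acc / (float) rows;
  }
}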
-TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, 
channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} - -TEST(QS8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qs8_requantize_fp32); - } -} \ No newline at end of file diff --git a/test/qs8-gavgpool-minmax-fp32.yaml b/test/qs8-gavgpool-minmax-fp32.yaml deleted file mode 100644 index ddb3d29afe8..00000000000 --- a/test/qs8-gavgpool-minmax-fp32.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2020 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
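The YAML body that follows is the specification consumed by tools/generate-gavgpool-test.py: one entry per ukernel, pairing the kernel symbol with its params-initialization function, from which the generator emitted the TEST blocks deleted above. A rough sketch of the pairing each entry encodes (GAvgPoolTestSpec is an illustrative type, not the generator's actual schema):

// Each deleted YAML entry amounts to a (name, init) pair like this.
struct GAvgPoolTestSpec {
  const char* name;  // ukernel symbol, e.g. the 7p7x__neon_c8 entry below
  const char* init;  // matching xnn_init_*_params function
};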
- -# ARM NEON -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neon_c32 - init: xnn_init_qs8_avgpool_minmax_fp32_neon_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32 - init: xnn_init_qs8_avgpool_minmax_fp32_neonv8_params -# x86 SSE -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_sse2_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_sse2_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_sse2_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_sse4_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_sse4_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_sse4_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_sse2_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_sse2_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_sse2_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_sse4_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_sse4_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_sse4_params -# WAsm SIMD -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24 - init: 
xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32 - init: xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params -# Scalar -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4 - init: xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params diff --git a/test/qs8-gavgpool-minmax-rndnu.cc b/test/qs8-gavgpool-minmax-rndnu.cc deleted file mode 100644 index 4baeaf4266a..00000000000 --- a/test/qs8-gavgpool-minmax-rndnu.cc +++ /dev/null @@ -1,1711 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/qs8-gavgpool-minmax-rndnu.yaml -// Generator: tools/generate-gavgpool-test.py - - -#include <gtest/gtest.h> -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "gavgpool-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - 
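In the 7p7x tests above the row counts encode the pass structure: rows(14) is exactly two 7-row passes ("2pass"), rows in [8, 14) leave a partial second pass ("2pass_subtile"), and rows from 14 to 35 in steps of 7 add whole passes ("multipass"). A one-liner capturing that arithmetic (gavgpool_7p7x_passes is a hypothetical helper for illustration, not an XNNPACK function):

#include <cstddef>

// ceil(rows / 7): accumulation passes a 7-rows-per-pass kernel needs.
size_t gavgpool_7p7x_passes(size_t rows) {
  return (rows + 6) / 7;
}
// gavgpool_7p7x_passes(14) == 2  -> the "2pass" tests
// gavgpool_7p7x_passes(35) == 5  -> upper bound of the "multipass" sweeps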
TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 
14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, 
channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 
32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, 
xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, 
xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - 
.channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - 
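The with_qmin/with_qmax variants throughout set one bound to 128, which the tester presumably maps into the signed qs8 range (128 - 0x80 = 0; stated here as an assumption about GAvgPoolMicrokernelTester, not verified from this patch), so the final clamp is actually exercised rather than being a no-op. A sketch of that clamp as the *_minmax kernels apply it after requantization (clamp_qs8 is an illustrative helper, not an XNNPACK symbol):

#include <algorithm>
#include <cstdint>

// Final saturation applied by minmax kernels after requantization.
int8_t clamp_qs8(int32_t value, int8_t output_min, int8_t output_max) {
  return (int8_t) std::min<int32_t>(
      std::max<int32_t>(value, output_min), output_max);
}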
TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, 
xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if 
XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_div_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_div_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); 
- } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, 
xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_div_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_div_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, 
xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_div_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_div_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - 
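The _with_qmax and _with_qmin cases below pin one output bound at 128 so that part of the averaged values must saturate. As a rough scalar model of the contract the tester verifies (an illustrative sketch only, not the deleted NEON code: the name gavgpool_minmax_ref and the scale/zero_point parameters are hypothetical stand-ins, and fp32-style rounding is shown where the rndnu kernels actually use fixed-point rounding shifts):

    #include <math.h>
    #include <stddef.h>
    #include <stdint.h>

    // Sum one channel over `rows` inputs spaced `stride` elements apart,
    // requantize the average, then clamp into [qmin, qmax] -- the "minmax"
    // part that the qmin(128)/qmax(128) tests stress.
    static int32_t gavgpool_minmax_ref(const int8_t* column, size_t rows,
                                       size_t stride, float scale,
                                       int32_t zero_point,
                                       int32_t qmin, int32_t qmax) {
      int32_t acc = 0;
      for (size_t r = 0; r < rows; r++) {
        acc += column[r * stride];
      }
      int32_t out = (int32_t) lrintf((float) acc * scale) + zero_point;
      if (out < qmin) out = qmin;  // lower saturation (exercised by the qmin tests)
      if (out > qmax) out = qmax;  // upper saturation (exercised by the qmax tests)
      return out;
    }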
TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_div_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_div_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } - - TEST(QS8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qs8_avgpool_minmax_rndnu_neon_params, xnn_qs8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 diff --git a/test/qs8-gavgpool-minmax-rndnu.yaml b/test/qs8-gavgpool-minmax-rndnu.yaml deleted file mode 100644 index e0848611c92..00000000000 --- a/test/qs8-gavgpool-minmax-rndnu.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ARM NEON -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params -- name: xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32 - init: xnn_init_qs8_avgpool_minmax_rndnu_neon_params diff --git a/test/qu8-gavgpool-minmax-fp32.cc b/test/qu8-gavgpool-minmax-fp32.cc deleted file mode 100644 index db5a9071c96..00000000000 --- a/test/qu8-gavgpool-minmax-fp32.cc +++ /dev/null @@ -1,10520 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! 
-// Specification: test/qu8-gavgpool-minmax-fp32.yaml -// Generator: tools/generate-gavgpool-test.py - - -#include <gtest/gtest.h> -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "gavgpool-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, 
channels_div_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_div_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - 
.channels(channels) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - 
.qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.input_stride(263) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 
17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - 
.input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 
1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, 
xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; 
channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, 
xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEON_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON_V8; - GAvgPoolMicrokernelTester() - .rows(14) 
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_eq_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_div_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(131)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_lt_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(11)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C8, channels_gt_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(29)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .input_stride(19)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_eq_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .input_stride(19)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_div_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(263)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_lt_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(19)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C16, channels_gt_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(47)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .input_stride(29)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_eq_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .input_stride(29)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_div_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(389)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_lt_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(29)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C24, channels_gt_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(61)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .input_stride(37)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(32)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .input_stride(37)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_eq_32_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .input_stride(37)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_div_32_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(521)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_lt_32_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(37)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__NEONV8_C32, channels_gt_32_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(79)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_eq_8_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_div_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_lt_8_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C8, channels_gt_8_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_eq_16_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_div_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_lt_16_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C16, channels_gt_16_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_eq_24_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_div_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_div_24_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_lt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C24, channels_gt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .input_stride(37)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_eq_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_div_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_div_32_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_lt_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEON_C32, channels_gt_32_fulltile_with_qmin) {
       channels_gt_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_eq_8_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_div_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_div_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8,
            xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_lt_8_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C8, channels_gt_8_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16,
       channels_eq_16_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_eq_16_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_div_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_div_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_lt_16_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C16, channels_gt_16_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_eq_24_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_div_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_div_24_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_lt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C24, channels_gt_24_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t rows = 1; rows < 7; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(32)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .input_stride(37)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_eq_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    GAvgPoolMicrokernelTester()
-      .rows(7)
-      .channels(32)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_div_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_div_32_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 64; channels < 256; channels += 32) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_lt_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 1; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_subtile) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__NEONV8_C32, channels_gt_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON_V8;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32, xnn_init_qu8_avgpool_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8,
       channels_eq_8_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_eq_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_div_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(131)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_lt_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(11)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C8, channels_gt_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(29)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .input_stride(19)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(16)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .input_stride(19)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_eq_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(16)
-        .input_stride(19)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16,
       channels_div_16_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_div_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 32; channels < 128; channels += 16) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(263)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_lt_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 16; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(19)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 17; channels < 32; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C16, channels_gt_16_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 17; channels < 32; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(47)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24,
       channels_eq_24_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .input_stride(29)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(24)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .input_stride(29)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_eq_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(24)
-        .input_stride(29)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_div_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 48; channels < 192; channels += 24) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(389)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_lt_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 1; channels < 24; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(29)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24,
       channels_gt_24_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE2_C24, channels_gt_24_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t channels = 25; channels < 48; channels++) {
-      for (size_t rows = 14; rows < 35; rows += 14) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(61)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .input_stride(11)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmax(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    GAvgPoolMicrokernelTester()
-      .rows(14)
-      .channels(8)
-      .qmin(128)
-      .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_2pass_subtile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 8; rows < 14; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_eq_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t rows = 14; rows <= 35; rows += 7) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_div_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(131)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8,
       channels_lt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_lt_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(11)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_fulltile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_2pass_subtile) {
-    TEST_REQUIRES_X86_SSE41;
-    for (size_t channels = 9; channels < 16; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, 
channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } 
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - 
.rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, 
xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SSE41_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, 
xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_div_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_div_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_div_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_div_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels 
= 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_div_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_div_24_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; 
channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_subtile) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE2_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE2; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24, xnn_init_qu8_avgpool_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_div_8_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_div_8_subtile) { - TEST_REQUIRES_X86_SSE41; - 
for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile) { - 
TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_div_16_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_div_16_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, 
channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_X86 || XNN_ARCH_X86_64 - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, 
xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_div_24_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_div_24_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_subtile) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } - - 
TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SSE41_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_X86_SSE41; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24, xnn_init_qu8_avgpool_minmax_fp32_sse4_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_2pass_fulltile) { - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - 
TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_2pass_subtile) { - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_multipass_fulltile) { - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_div_8_multipass_fulltile_with_input_stride) { - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(131) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_fulltile) { - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_2pass_subtile) { - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_multipass_fulltile) { - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_fulltile) { - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_fulltile_with_qmax) { - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_fulltile_with_qmin) { - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_2pass_subtile) { - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_multipass_fulltile) { - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, 
xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_2pass_fulltile) { - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_2pass_subtile) { - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_multipass_fulltile) { - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_div_16_multipass_fulltile_with_input_stride) { - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_fulltile) { - 
for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_2pass_subtile) { - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_multipass_fulltile) { - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_fulltile) { - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_fulltile_with_qmax) { - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_fulltile_with_qmin) { - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_2pass_subtile) { - for (size_t channels = 17; channels < 
32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_multipass_fulltile) { - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, 
xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_2pass_fulltile) { - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_2pass_subtile) { - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_multipass_fulltile) { - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_div_24_multipass_fulltile_with_input_stride) { - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_fulltile) { - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_2pass_subtile) { - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, 
xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_multipass_fulltile) { - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_fulltile) { - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_fulltile_with_qmax) { - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_fulltile_with_qmin) { - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_2pass_subtile) { - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_multipass_fulltile) { - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, 
channels_eq_32_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_2pass_fulltile) { - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_2pass_subtile) { - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_multipass_fulltile) { - for (size_t channels = 64; channels 
< 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_div_32_multipass_fulltile_with_input_stride) { - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_fulltile) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_2pass_subtile) { - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_multipass_fulltile) { - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_fulltile) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - 
TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_fulltile_with_qmax) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_fulltile_with_qmin) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_2pass_subtile) { - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_multipass_fulltile) { - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__WASMSIMD_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_eq_8_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_div_8_fulltile) { - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_div_8_subtile) { - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_fulltile) { - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_subtile) { - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_fulltile_with_qmax) { - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_lt_8_fulltile_with_qmin) { - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_fulltile) { - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_subtile) { - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_fulltile_with_qmax) { - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C8, channels_gt_8_fulltile_with_qmin) { - for (size_t 
channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_eq_16_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_div_16_fulltile) { - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_div_16_subtile) { - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_fulltile) { - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_subtile) { - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_fulltile_with_qmax) { - for (size_t channels = 1; channels < 16; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_lt_16_fulltile_with_qmin) { - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_fulltile) { - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_subtile) { - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_fulltile_with_qmax) { - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C16, channels_gt_16_fulltile_with_qmin) { - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_eq_24_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - 
.channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_div_24_fulltile) { - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_div_24_subtile) { - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_fulltile) { - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_subtile) { - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_fulltile_with_qmax) { - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_lt_24_fulltile_with_qmin) { - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_fulltile) { - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_subtile) { - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_fulltile_with_qmax) { - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - 
TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C24, channels_gt_24_fulltile_with_qmin) { - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_eq_32_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_div_32_fulltile) { - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_div_32_subtile) { - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_fulltile) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_subtile) { - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, 
channels_lt_32_fulltile_with_qmax) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_lt_32_fulltile_with_qmin) { - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_fulltile) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_subtile) { - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_fulltile_with_qmax) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__WASMSIMD_C32, channels_gt_32_fulltile_with_qmin) { - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32, xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32); - } - } -#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - 
.channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_eq_2_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_2pass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_2pass_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_multipass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_div_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() 
- .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_2pass_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_multipass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_lt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_2pass_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_multipass_fulltile) { - 
for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C2, channels_gt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, 
xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_FMAGIC_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - 
.input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, 
xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_eq_2_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_2pass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_2pass_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_multipass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_div_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 4; channels < 16; channels += 
2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_2pass_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_multipass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_lt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_2pass_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_multipass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C2, channels_gt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) 
- .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_2pass_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_IMAGIC_C4, 
channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(1) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_eq_1_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_2pass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_2pass_subtile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_multipass_fulltile) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_div_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 8; channels += 1) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_2pass_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_multipass_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C1, channels_gt_1_multipass_fulltile_with_input_stride) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() 
- .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(2) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_eq_2_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_2pass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_2pass_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 8; rows < 14; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_multipass_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_div_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_2pass_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_multipass_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_lt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, 
xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_2pass_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_multipass_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C2, channels_gt_2_multipass_fulltile_with_input_stride) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(17) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_fulltile_with_qmin) { - 
GAvgPoolMicrokernelTester() - .rows(14) - .channels(4) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_subtile) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_2pass_subtile_with_input_stride) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_multipass_fulltile) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_eq_4_multipass_fulltile_with_input_stride) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_2pass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_2pass_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_multipass_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_div_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(67) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_2pass_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_multipass_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_lt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_2pass_subtile) { - for (size_t 
channels = 5; channels < 8; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_multipass_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7P7X__SCALAR_LRINTF_C4, channels_gt_4_multipass_fulltile_with_input_stride) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(23) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_eq_2_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_div_2_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_div_2_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_subtile) { - for (size_t 
channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_lt_2_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C2, channels_gt_2_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - 
-TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_FMAGIC_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, 
xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_eq_2_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_div_2_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_div_2_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, 
xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_lt_2_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C2, channels_gt_2_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, 
xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_IMAGIC_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(1) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .input_stride(3) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_eq_1_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(1) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_fulltile) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_subtile) { - for (size_t channels = 2; channels < 10; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_fulltile_with_qmax) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C1, channels_gt_1_fulltile_with_qmin) { - for (size_t channels = 2; channels < 10; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - 
.Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(2) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .input_stride(5) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_eq_2_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(2) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_div_2_fulltile) { - for (size_t channels = 4; channels < 16; channels += 2) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_div_2_subtile) { - for (size_t channels = 4; channels < 16; channels += 2) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_fulltile) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_subtile) { - for (size_t channels = 1; channels < 2; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_fulltile_with_qmax) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, 
xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_lt_2_fulltile_with_qmin) { - for (size_t channels = 1; channels < 2; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_fulltile) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_subtile) { - for (size_t channels = 3; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_fulltile_with_qmax) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C2, channels_gt_2_fulltile_with_qmin) { - for (size_t channels = 3; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_subtile) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(4) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile_with_input_stride) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .input_stride(7) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile_with_qmax) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_eq_4_fulltile_with_qmin) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(4) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); -} 
- -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_div_4_fulltile) { - for (size_t channels = 8; channels < 32; channels += 4) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_div_4_subtile) { - for (size_t channels = 8; channels < 32; channels += 4) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_fulltile) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_subtile) { - for (size_t channels = 1; channels < 4; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_fulltile_with_qmax) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_lt_4_fulltile_with_qmin) { - for (size_t channels = 1; channels < 4; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_fulltile) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_subtile) { - for (size_t channels = 5; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_fulltile_with_qmax) { - for (size_t channels = 5; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} - -TEST(QU8_GAVGPOOL_MINMAX_FP32_7X__SCALAR_LRINTF_C4, channels_gt_4_fulltile_with_qmin) { - for (size_t channels = 5; channels < 
8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4, xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params, xnn_qu8_requantize_fp32); - } -} \ No newline at end of file diff --git a/test/qu8-gavgpool-minmax-fp32.yaml b/test/qu8-gavgpool-minmax-fp32.yaml deleted file mode 100644 index bbaa64ec8a3..00000000000 --- a/test/qu8-gavgpool-minmax-fp32.yaml +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2022 Google LLC -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ARM NEON -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neon_c32 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__neonv8_c32 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neon_c32 - init: xnn_init_qu8_avgpool_minmax_fp32_neon_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__neonv8_c32 - init: xnn_init_qu8_avgpool_minmax_fp32_neonv8_params -# x86 SSE -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_sse2_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_sse2_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_sse2_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_sse4_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_sse4_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_sse4_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_sse2_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_sse2_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_sse2_params -- name: 
xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_sse4_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_sse4_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_sse4_params -# WAsm SIMD -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c32 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c8 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c24 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c32 - init: xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params -# Scalar -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c1 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c2 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c1 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c2 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_lrintf_c4 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c1 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c2 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_fmagic_c4 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c2 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c1 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params -- name: xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c2 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params -- name: 
xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_lrintf_c4 - init: xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params diff --git a/test/qu8-gavgpool-minmax-rndnu.cc b/test/qu8-gavgpool-minmax-rndnu.cc deleted file mode 100644 index 2385c43ebf0..00000000000 --- a/test/qu8-gavgpool-minmax-rndnu.cc +++ /dev/null @@ -1,1711 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. -// -// Auto-generated file. Do not edit! -// Specification: test/qu8-gavgpool-minmax-rndnu.yaml -// Generator: tools/generate-gavgpool-test.py - - -#include -#include "xnnpack/common.h" -#include "xnnpack/gavgpool.h" -#include "xnnpack/isa-checks.h" -#include "xnnpack/microparams-init.h" -#include "gavgpool-microkernel-tester.h" - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(8) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_eq_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - 
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(8)
-        .input_stride(11)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_multipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_div_8_multipass_fulltile_with_input_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 16; channels < 64; channels += 8) {
-      for (size_t rows = 14; rows <= 35; rows += 7) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .input_stride(131)
-          .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(14)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_2pass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 8; channels++) {
-      for (size_t rows = 8; rows < 14; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-      }
-    }
-  }
-
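For orientation, the requantized averaging these cases verify can be sketched in portable C++. This is a simplified stand-in for the tester's reference computation, not XNNPACK's implementation: the helper name and the float-based rounding are illustrative (the rndnu kernels use fixed-point round-to-nearest-up), but the zero-point handling and the [qmin, qmax] clamping follow the same contract that the qmin/qmax cases above probe.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Hypothetical reference: average `rows` QU8 values of one channel, then
// requantize with round-to-nearest and clamp to [qmin, qmax].
uint8_t naive_gavgpool_qu8(const uint8_t* column, size_t rows, size_t stride,
                           int32_t input_zero_point, float scale,
                           int32_t output_zero_point, uint8_t qmin, uint8_t qmax) {
  int32_t acc = 0;
  for (size_t r = 0; r < rows; r++) {
    acc += int32_t(column[r * stride]) - input_zero_point;  // zero-point-adjusted sum
  }
  // `scale` is assumed to fold input_scale / (rows * output_scale) into one factor.
  const int32_t q = int32_t(std::lrintf(float(acc) * scale)) + output_zero_point;
  return uint8_t(std::min(std::max(q, int32_t(qmin)), int32_t(qmax)));
}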
TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_lt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C8, channels_gt_8_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile) { - 
TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_eq_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_div_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(263) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_lt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_fulltile) { - 
TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C16, channels_gt_16_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(47) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, 
channels_eq_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_eq_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_div_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(389) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - 
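The 2pass/multipass naming above maps onto the 7p7x kernels' row schedule: a primary pass over 7 rows initializes per-channel accumulators in a scratch buffer, then incremental passes of up to 7 rows add into it. A scalar model of that schedule, under assumed semantics rather than the NEON implementation (the real kernels accumulate vectorized channel tiles):

#include <cstddef>
#include <cstdint>

void accumulate_7p7x(size_t rows, size_t channels, const uint8_t* input,
                     size_t input_stride, int32_t* buffer) {
  // Primary tile: the first 7 rows initialize the accumulators.
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = 0;
    for (size_t r = 0; r < 7; r++) {
      acc += input[r * input_stride + c];
    }
    buffer[c] = acc;
  }
  // Incremental tiles: up to 7 more rows per pass until all rows are consumed.
  for (size_t done = 7; done < rows; done += 7) {
    const size_t batch = (rows - done) < 7 ? (rows - done) : 7;
    for (size_t c = 0; c < channels; c++) {
      for (size_t r = 0; r < batch; r++) {
        buffer[c] += input[(done + r) * input_stride + c];
      }
    }
  }
}

Under this schedule, rows == 14 is the "2pass fulltile" case (7 + 7), rows in [8, 13] are "2pass subtile" cases (7 plus a partial second pass), and rows = 14, 21, 28, 35 walk the multipass full-tile schedule that the loops above iterate.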
TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_lt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - 
.Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C24, channels_gt_24_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(61) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(14) - .channels(32) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_2pass_subtile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, 
xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_eq_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_div_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(521) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - 
GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_lt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 14; rows <= 35; rows += 7) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - GAvgPoolMicrokernelTester() - .rows(14) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_2pass_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 8; rows < 14; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_multipass_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7P7X__NEON_C32, channels_gt_32_multipass_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 33; channels < 64; channels++) { - for (size_t rows = 14; rows < 35; rows += 14) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(79) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(8) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .input_stride(11) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_eq_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(8) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_div_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_div_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 16; channels < 64; channels += 8) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - 
GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_lt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 8; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C8, channels_gt_8_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 9; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(16) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .input_stride(19) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_eq_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(16) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_div_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_div_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 32; channels < 128; channels += 16) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_lt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 16; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - for (size_t 
rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C16, channels_gt_16_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 17; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(24) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .input_stride(29) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_eq_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(24) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_div_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_div_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 48; channels < 192; channels += 24) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - 
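The channels_lt/div/gt groupings in this unipass section probe the kernel's channel tiling: a c24 kernel processes 24 channels per main-loop iteration and takes a remainder path for any tail. A schematic stand-in (the callback-style dispatch below is illustrative; the NEON kernels inline both paths):

#include <cstddef>

// Illustrative channel tiling for a c24 kernel: full 24-channel tiles first,
// then one remainder step for the 1-23 leftover channels.
template <typename FullTile, typename Remainder>
void for_each_channel_tile24(size_t channels, FullTile full_tile, Remainder remainder) {
  size_t c = 0;
  for (; c + 24 <= channels; c += 24) {
    full_tile(c);
  }
  if (c < channels) {
    remainder(c, channels - c);
  }
}

So channels_div_24 exercises only the full-tile loop, channels_lt_24 only the remainder path, and channels_gt_24 one full tile plus a remainder.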
TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_lt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 24; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C24, channels_gt_24_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 25; channels < 48; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } -#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 - - -#if XNN_ARCH_ARM || XNN_ARCH_ARM64 - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_subtile) 
{ - TEST_REQUIRES_ARM_NEON; - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(32) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile_with_input_stride) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .input_stride(37) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_eq_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - GAvgPoolMicrokernelTester() - .rows(7) - .channels(32) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_div_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_div_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 64; channels < 256; channels += 32) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_fulltile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_subtile) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - for (size_t rows = 1; rows < 7; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_fulltile_with_qmax) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmax(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); - } - } - - TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_lt_32_fulltile_with_qmin) { - TEST_REQUIRES_ARM_NEON; - for (size_t channels = 1; channels < 32; channels++) { - GAvgPoolMicrokernelTester() - .rows(7) - .channels(channels) - .qmin(128) - .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, 
xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      for (size_t rows = 1; rows < 7; rows++) {
-        GAvgPoolMicrokernelTester()
-          .rows(rows)
-          .channels(channels)
-          .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-      }
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmax(128)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-
-  TEST(QU8_GAVGPOOL_MINMAX_RNDNU_7X__NEON_C32, channels_gt_32_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 33; channels < 64; channels++) {
-      GAvgPoolMicrokernelTester()
-        .rows(7)
-        .channels(channels)
-        .qmin(128)
-        .Test(xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32, xnn_init_qu8_avgpool_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/qu8-gavgpool-minmax-rndnu.yaml b/test/qu8-gavgpool-minmax-rndnu.yaml
deleted file mode 100644
index 85e5f573bcb..00000000000
--- a/test/qu8-gavgpool-minmax-rndnu.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2022 Google LLC
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# ARM NEON
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c16
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c24
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c32
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c16
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c24
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
-- name: xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c32
-  init: xnn_init_qu8_avgpool_minmax_rndnu_neon_params
diff --git a/tools/generate-gavgpool-test.py b/tools/generate-gavgpool-test.py
deleted file mode 100755
index 11cea3605f9..00000000000
--- a/tools/generate-gavgpool-test.py
+++ /dev/null
@@ -1,767 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 Google LLC
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
diff --git a/tools/generate-gavgpool-test.py b/tools/generate-gavgpool-test.py
deleted file mode 100755
index 11cea3605f9..00000000000
--- a/tools/generate-gavgpool-test.py
+++ /dev/null
@@ -1,767 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 Google LLC
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import argparse
-import codecs
-import math
-import os
-import re
-import sys
-import yaml
-
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from primes import next_prime
-import xngen
-import xnncommon
-
-
-parser = argparse.ArgumentParser(description='GAvgPool microkernel test generator')
-parser.add_argument("-s", "--spec", metavar="FILE", required=True,
-                    help="Specification (YAML) file")
-parser.add_argument("-o", "--output", metavar="FILE", required=True,
-                    help='Output (C++ source) file')
-parser.set_defaults(defines=list())
-
-
-def split_ukernel_name(name):
-  match = re.match(r"xnn_(qs8|qu8|f16|f32)_(gavgpool|rdsum)(_(minmax))?(_(fp32|rndnu))?_ukernel_((\d+)p)?(\d+)x__(.+)_c(\d+)(_acc(\d+))?(v)?", name)
-  if match is None:
-    raise ValueError("Unexpected microkernel name: " + name)
-
-  requantization_type = match.group(6)
-  if match.group(7):
-    primary_tile = int(match.group(8))
-    incremental_tile = int(match.group(9))
-  else:
-    primary_tile = int(match.group(9))
-    incremental_tile = 0
-  channel_tile = int(match.group(11))
-  vector_tile = bool(match.group(12))
-
-  arch, isa, assembly = xnncommon.parse_target_name(target_name=match.group(10))
-  return requantization_type, primary_tile, incremental_tile, channel_tile, vector_tile, arch, isa
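A worked example of the parser above, using its own regex: the multipass kernel name `xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8` splits into a primary tile of 7 rows, an incremental tile of 7 rows, and a channel tile of 8 (runnable sketch):

import re

# Same pattern as split_ukernel_name() above.
NAME = "xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8"
m = re.match(r"xnn_(qs8|qu8|f16|f32)_(gavgpool|rdsum)(_(minmax))?(_(fp32|rndnu))?"
             r"_ukernel_((\d+)p)?(\d+)x__(.+)_c(\d+)(_acc(\d+))?(v)?", NAME)
print(m.group(6))   # requantization type: 'rndnu'
print(m.group(8))   # primary tile: '7' (rows reduced in the first pass)
print(m.group(9))   # incremental tile: '7' (rows per additional pass)
print(m.group(10))  # target: 'neon'
print(m.group(11))  # channel tile: '8'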
-
-
-AVGPOOL_TEST_TEMPLATE = """\
-$if INCREMENTAL_TILE == 0:
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_subtile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t rows = 1; rows < ${PRIMARY_TILE}; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(${CHANNEL_SCALED_TILE})
-        .Test(${", ".join(TEST_ARGS)});
-    }
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_input_stride) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-        .input_stride(${next_prime(CHANNEL_TILE+1)})
-      $else:
-        .input_stride(${CHANNEL_SCALED_TILE}+1)
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_qmax) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      .qmax(128)
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_qmin) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      .qmin(128)
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  $if CHANNEL_TILE > 1 or CHANNEL_SCALED_TILE != CHANNEL_TILE:
-    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(${PRIMARY_TILE})
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      $else:
-        for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; channels += ${CHANNEL_SCALED_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(${PRIMARY_TILE})
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-    }
-
-    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}${CHANNEL_SUFFIX}_subtile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
-          for (size_t rows = 1; rows < ${PRIMARY_TILE}; rows++) {
-            GAvgPoolMicrokernelTester()
-              .rows(rows)
-              .channels(channels)
-              .Test(${", ".join(TEST_ARGS)});
-          }
-        }
-      $else:
-        for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; channels += ${CHANNEL_SCALED_TILE}) {
-          for (size_t rows = 1; rows < ${PRIMARY_TILE}; rows++) {
-            GAvgPoolMicrokernelTester()
-              .rows(rows)
-              .channels(channels)
-              .Test(${", ".join(TEST_ARGS)});
-          }
-        }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_subtile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        for (size_t rows = 1; rows < ${PRIMARY_TILE}; rows++) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_qmax) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .qmax(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_qmin) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .qmin(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_subtile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        for (size_t rows = 1; rows < ${PRIMARY_TILE}; rows++) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        for (size_t rows = 1; rows < ${PRIMARY_TILE}; rows++) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_qmax) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .qmax(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .qmax(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_fulltile_with_qmin) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .qmin(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE})
-          .channels(channels)
-          .qmin(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-  }
-$else:
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_input_stride) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-      .channels(${CHANNEL_TILE})
-      $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-        .input_stride(${next_prime(CHANNEL_TILE+1)})
-      $else:
-        .input_stride(${CHANNEL_SCALED_TILE}+1)
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_qmax) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      .qmax(128)
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_qmin) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    GAvgPoolMicrokernelTester()
-      .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-      .channels(${CHANNEL_SCALED_TILE})
-      .qmin(128)
-      .Test(${", ".join(TEST_ARGS)});
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_subtile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(${CHANNEL_SCALED_TILE})
-        .Test(${", ".join(TEST_ARGS)});
-    }
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_subtile_with_input_stride) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(${CHANNEL_SCALED_TILE})
-        $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-          .input_stride(${next_prime(CHANNEL_TILE+1)})
-        $else:
-          .input_stride(${CHANNEL_SCALED_TILE}+1)
-        .Test(${", ".join(TEST_ARGS)});
-    }
-  }
-
-  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) {
-      GAvgPoolMicrokernelTester()
-        .rows(rows)
-        .channels(${CHANNEL_SCALED_TILE})
".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile_with_input_stride) { - $if ISA_CHECK: - ${ISA_CHECK}; - for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(${CHANNEL_SCALED_TILE}) - $if CHANNEL_SCALED_TILE == CHANNEL_TILE: - .input_stride(${next_prime(CHANNEL_TILE+1)}) - $else: - .input_stride(${CHANNEL_SCALED_TILE}+1) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile) { - $if ISA_CHECK: - ${ISA_CHECK}; - $if CHANNEL_SCALED_TILE == CHANNEL_TILE: - for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) { - GAvgPoolMicrokernelTester() - .rows(${PRIMARY_TILE+INCREMENTAL_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - $else: - for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; channels += ${CHANNEL_SCALED_TILE}) { - GAvgPoolMicrokernelTester() - .rows(${PRIMARY_TILE+INCREMENTAL_TILE}) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - - TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_subtile) { - $if ISA_CHECK: - ${ISA_CHECK}; - $if CHANNEL_SCALED_TILE == CHANNEL_TILE: - for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) { - for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - $else: - for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; channels += ${CHANNEL_SCALED_TILE}) { - for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile) { - $if ISA_CHECK: - ${ISA_CHECK}; - $if CHANNEL_SCALED_TILE == CHANNEL_TILE: - for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) { - for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - $else: - for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; channels += ${CHANNEL_SCALED_TILE}) { - for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .Test(${", ".join(TEST_ARGS)}); - } - } - } - - TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile_with_input_stride) { - $if ISA_CHECK: - ${ISA_CHECK}; - $if CHANNEL_SCALED_TILE == CHANNEL_TILE: - for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) { - for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) { - GAvgPoolMicrokernelTester() - .rows(rows) - .channels(channels) - .input_stride(${next_prime(CHANNEL_TILE*16+1)}) - .Test(${", ".join(TEST_ARGS)}); - } - } - $else: - for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; 
-      for (size_t channels = ${CHANNEL_SCALED_TILE}*2; channels < ${CHANNEL_SCALED_TILE}*8; channels += ${CHANNEL_SCALED_TILE}) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .input_stride(${CHANNEL_SCALED_TILE}*16+1)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-  }
-
-  $if CHANNEL_TILE > 1 or CHANNEL_SCALED_TILE != CHANNEL_TILE:
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_qmax) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .qmax(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_qmin) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .qmin(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_subtile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    }
-
-    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile_with_input_stride) {
-      $if ISA_CHECK:
-        ${ISA_CHECK};
-      for (size_t channels = 1; channels < ${CHANNEL_SCALED_TILE}; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows <= ${INCREMENTAL_TILE*5}; rows += ${INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-              .input_stride(${next_prime(CHANNEL_TILE+1)})
-            $else:
-              .input_stride(${CHANNEL_SCALED_TILE}+1)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_qmax) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .qmax(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .qmax(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_fulltile_with_qmin) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .qmin(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        GAvgPoolMicrokernelTester()
-          .rows(${PRIMARY_TILE+INCREMENTAL_TILE})
-          .channels(channels)
-          .qmin(128)
-          .Test(${", ".join(TEST_ARGS)});
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_2pass_subtile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+1}; rows < ${PRIMARY_TILE+INCREMENTAL_TILE}; rows++) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows < ${INCREMENTAL_TILE*5}; rows += ${PRIMARY_TILE+INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows < ${INCREMENTAL_TILE*5}; rows += ${PRIMARY_TILE+INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-  }
-
-  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}${CHANNEL_SUFFIX}_multipass_fulltile_with_input_stride) {
-    $if ISA_CHECK:
-      ${ISA_CHECK};
-    $if CHANNEL_SCALED_TILE == CHANNEL_TILE:
-      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows < ${INCREMENTAL_TILE*5}; rows += ${PRIMARY_TILE+INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .input_stride(${next_prime(CHANNEL_TILE*2+11)})
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-    $else:
-      for (size_t channels = ${CHANNEL_SCALED_TILE}+1; channels < ${CHANNEL_SCALED_TILE}*2; channels++) {
-        for (size_t rows = ${PRIMARY_TILE+INCREMENTAL_TILE}; rows < ${INCREMENTAL_TILE*5}; rows += ${PRIMARY_TILE+INCREMENTAL_TILE}) {
-          GAvgPoolMicrokernelTester()
-            .rows(rows)
-            .channels(channels)
-            .input_stride(${CHANNEL_SCALED_TILE}*2+11)
-            .Test(${", ".join(TEST_ARGS)});
-        }
-      }
-  }
-
-"""
-
-
-def generate_test_cases(ukernel, init_fn, requantization_type, primary_tile,
-                        incremental_tile, channel_tile, vector_tile, isa):
-  """Generates all tests cases for a GAVGPOOL micro-kernel.
-
-  Args:
-    ukernel: C name of the micro-kernel function.
-    init_fn: C name of the function to initialize microkernel parameters.
-    requantization_type: Requantization type (FP32/RNDNU).
-    primary_tile: Number of rows (pixels) processed per one iteration of the
-      primary outer loop of the micro-kernel.
-    incremental_tile: Number of rows (pixels) processed per one iteration of
-      the incremental outer loop of the micro-kernel.
-    channel_tile: Number of channels processed per one iteration of the inner
-      loops of the micro-kernel.
-    vector_tile: Indicates if channels are specified in vectors rather than
-      elements.
-    isa: instruction set required to run the micro-kernel. Generated unit test
-      will skip execution if the host processor doesn't support this ISA.
-
-  Returns:
-    Code for the test case.
-  """
-  _, test_name = ukernel.split("_", 1)
-  _, datatype, ukernel_type, _ = ukernel.split("_", 3)
-  test_args = [ukernel, init_fn]
-  if requantization_type:
-    test_args.append("xnn_%s_requantize_%s" % \
-      (datatype.lower(), requantization_type.lower()))
-  channel_scaled_tile = channel_tile
-  if vector_tile:
-    ctype = {"qs8": "int8_t", "qu8": "uint8_t", "f16": "uint16_t", "f32": "float"}[datatype]
-    channel_scaled_tile = {"rvv": "(%s*xnn_init_hardware_config()->vlenb/sizeof(%s))" % (str(channel_tile), ctype)}[isa]
-  return xngen.preprocess(AVGPOOL_TEST_TEMPLATE, {
-      "TEST_NAME": test_name.upper().replace("UKERNEL_", ""),
-      "TEST_ARGS": test_args,
-      "DATATYPE": datatype,
-      "PRIMARY_TILE": primary_tile,
-      "INCREMENTAL_TILE": incremental_tile,
-      "CHANNEL_TILE": channel_tile,
-      "CHANNEL_SCALED_TILE": channel_scaled_tile,
-      "CHANNEL_SUFFIX": "v" if vector_tile else "",
-      "ISA_CHECK": xnncommon.generate_isa_check_macro(isa),
-      "next_prime": next_prime,
-    })
-
-
-def main(args):
-  options = parser.parse_args(args)
-
-  with codecs.open(options.spec, "r", encoding="utf-8") as spec_file:
-    spec_yaml = yaml.safe_load(spec_file)
-    if not isinstance(spec_yaml, list):
-      raise ValueError("expected a list of micro-kernels in the spec")
-
-    tests = """\
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-//
-// Auto-generated file. Do not edit!
-//   Specification: {specification}
-//   Generator: {generator}
-
-
-#include <gtest/gtest.h>
-#include "xnnpack/common.h"
-#include "xnnpack/gavgpool.h"
-#include "xnnpack/isa-checks.h"
-#include "xnnpack/microparams-init.h"
-#include "gavgpool-microkernel-tester.h"
-""".format(specification=options.spec, generator=sys.argv[0])
-
-    for ukernel_spec in spec_yaml:
-      name = ukernel_spec["name"]
-      init_fn = ukernel_spec.get("init")
-      requantization_type, primary_tile, incremental_tile, channel_tile, vector_tile, arch, \
-        isa = split_ukernel_name(name)
-
-      test_case = generate_test_cases(name, init_fn, requantization_type,
-                                      primary_tile, incremental_tile,
-                                      channel_tile, vector_tile, isa)
-      tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa)
-
-    xnncommon.overwrite_if_changed(options.output, tests)
-
-
-if __name__ == "__main__":
-  main(sys.argv[1:])
diff --git a/tools/generate-rdsum-benchmark.py b/tools/generate-rdsum-benchmark.py
index 3a81702ded2..0f62538396c 100755
--- a/tools/generate-rdsum-benchmark.py
+++ b/tools/generate-rdsum-benchmark.py
@@ -6,19 +6,17 @@
 
 import argparse
 import codecs
-import math
 import os
 import re
 import sys
 import yaml
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from primes import next_prime
 import xngen
 import xnncommon
 
 
-parser = argparse.ArgumentParser(description='GAvgPool microkernel test generator')
+parser = argparse.ArgumentParser(description="RDsum microkernel test generator")
 parser.add_argument("-s", "--spec", metavar="FILE", required=True,
                     help="Specification (YAML) file")
 parser.add_argument(