diff --git a/bench/packw-benchmark.h b/bench/packw-benchmark.h index 86326bc1256..db54553e2bb 100644 --- a/bench/packw-benchmark.h +++ b/bench/packw-benchmark.h @@ -76,6 +76,66 @@ static void x8_packw(benchmark::State& state, benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } +static void x8_gio_packw(benchmark::State& state, + xnn_x8_packw_gemm_gio_ukernel_fn packw, + size_t nr, size_t kr, size_t sr, + benchmark::utils::IsaCheckFunction isa_check = nullptr) +{ + if (isa_check != nullptr && !isa_check(state)) { + return; + } + + const size_t batch = state.range(0); // batch is g parameter for packw + const size_t dim_n = state.range(2); // dim_n is nc parameter + const size_t dim_k = state.range(3); // dim_k is kc parameter + + const size_t rounded_n = benchmark::utils::RoundUp(dim_n, nr); + const size_t rounded_k = benchmark::utils::RoundUp(dim_k, kr * sr); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + + // Compute num_buffers that fit cache with source weights + packed_weights. 
+ const size_t num_buffers = 1 + + benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), + sizeof(int8_t) * batch * (dim_n * dim_k + rounded_n * rounded_k + rounded_n)); + + xnnpack::Buffer weights(num_buffers * batch * + dim_n * dim_k); + xnnpack::fill_uniform_random_bits(weights.data(), weights.size(), rng); + xnnpack::Buffer packed_weights( + num_buffers * batch * + (rounded_n * rounded_k + rounded_n * sizeof(uint32_t))); + + const xnn_qs8_packw_params params = {127}; + + size_t buffer_index = 0; + for (auto _ : state) { + if (++buffer_index == num_buffers) { + buffer_index = 0; + } + + packw(batch, dim_n, dim_k, nr, kr, sr, dim_n /* k_stride */, + weights.data() + buffer_index * batch * dim_n * dim_k, + /*bias=*/nullptr, /*scale=*/nullptr, + packed_weights.data() + buffer_index * batch * (rounded_n * rounded_k + rounded_n), + /*extra_bytes=*/0, &params); + } + + const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); + if (cpu_frequency != 0) { + state.counters["cpufreq"] = cpu_frequency; + } + + const size_t elements_per_iteration = batch * dim_n * dim_k; + state.counters["elements"] = + benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); + + const size_t bytes_per_iteration = (elements_per_iteration + batch * (rounded_n * rounded_k + rounded_n)) * sizeof(int8_t); + state.counters["bytes"] = + benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); +} + static void qs8_packw(benchmark::State& state, xnn_qs8_packw_gemm_goi_ukernel_fn packw, size_t nr, size_t kr, size_t sr, @@ -136,6 +196,66 @@ static void qs8_packw(benchmark::State& state, benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); } +static void qs8_gio_packw(benchmark::State& state, + xnn_qs8_packw_gemm_gio_ukernel_fn packw, + size_t nr, size_t kr, size_t sr, + benchmark::utils::IsaCheckFunction isa_check = nullptr) +{ + if 
(isa_check != nullptr && !isa_check(state)) { + return; + } + + const size_t batch = state.range(0); // batch is g parameter for packw + const size_t dim_n = state.range(2); // dim_n is nc parameter + const size_t dim_k = state.range(3); // dim_k is kc parameter + + const size_t rounded_n = benchmark::utils::RoundUp(dim_n, nr); + const size_t rounded_k = benchmark::utils::RoundUp(dim_k, kr * sr); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + + // Compute num_buffers that fit cache with source weights + packed_weights. + const size_t num_buffers = 1 + + benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(), + sizeof(int8_t) * batch * (dim_n * dim_k + rounded_n * rounded_k + rounded_n)); + + xnnpack::Buffer weights(num_buffers * batch * + dim_n * dim_k); + xnnpack::fill_uniform_random_bits(weights.data(), weights.size(), rng); + xnnpack::Buffer packed_weights( + num_buffers * batch * + (rounded_n * rounded_k + rounded_n * sizeof(uint32_t))); + + const xnn_qs8_packw_params params = {127}; + + size_t buffer_index = 0; + for (auto _ : state) { + if (++buffer_index == num_buffers) { + buffer_index = 0; + } + + packw(batch, dim_n, dim_k, nr, kr, sr, dim_n, + weights.data() + buffer_index * batch * dim_n * dim_k, + /*bias=*/nullptr, /*scale=*/nullptr, + packed_weights.data() + buffer_index * batch * (rounded_n * rounded_k + rounded_n), + /*extra_bytes=*/0, &params); + } + + const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); + if (cpu_frequency != 0) { + state.counters["cpufreq"] = cpu_frequency; + } + + const size_t elements_per_iteration = batch * dim_n * dim_k; + state.counters["elements"] = + benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); + + const size_t bytes_per_iteration = (elements_per_iteration + batch * (rounded_n * rounded_k + rounded_n)) * sizeof(int8_t); + state.counters["bytes"] = + benchmark::Counter(uint64_t(state.iterations()) * 
bytes_per_iteration, benchmark::Counter::kIsRate); +} + static void x16_packw(benchmark::State& state, xnn_x16_packw_gemm_goi_ukernel_fn packw, size_t nr, size_t kr, size_t sr, @@ -365,6 +485,67 @@ BENCHMARK_BGEMM(x8_packw_x8__reference) BENCHMARK_BGEMM(x8_packw_x16__reference) BENCHMARK_BGEMM(x8_packw_x32__reference) +static void x8_packw_gio__reference( + size_t batch, + size_t dim_n, + size_t dim_k, + size_t nr, + size_t kr, + size_t sr, + const int8_t* weights, + const uint32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + xnn_pack_f32_qs8w_gemm_gio_w(batch, dim_n, dim_k, nr, kr, sr, dim_n, + reinterpret_cast(weights), + reinterpret_cast(bias), + static_cast(scale), + static_cast(packed_weights), + extra_bytes, params); +} + +static void x8_packw_gio_x2__reference(benchmark::State& state, const char* net) { + x8_packw(state, + x8_packw_gio__reference, + /*nr=*/2, /*kr=*/1, /*sr=*/1); +} +static void x8_packw_gio_x4__reference(benchmark::State& state, const char* net) { + x8_packw(state, + x8_packw_gio__reference, + /*nr=*/4, /*kr=*/1, /*sr=*/1); +} +static void x8_packw_gio_x8__reference(benchmark::State& state, const char* net) { + x8_packw(state, + x8_packw_gio__reference, + /*nr=*/8, /*kr=*/1, /*sr=*/1); +} +static void x8_packw_gio_x16__reference(benchmark::State& state, const char* net) { + x8_packw(state, + x8_packw_gio__reference, + /*nr=*/16, /*kr=*/1, /*sr=*/1); +} +static void x8_packw_gio_x32__reference(benchmark::State& state, const char* net) { + x8_packw(state, + x8_packw_gio__reference, + /*nr=*/32, /*kr=*/1, /*sr=*/1); +} + +static void x8_packw_gio_x8c8__reference(benchmark::State& state, const char* net) { + x8_packw(state, + x8_packw_gio__reference, + /*nr=*/8, /*kr=*/8, /*sr=*/1); +} + +BENCHMARK_BGEMM(x8_packw_gio_x2__reference) +BENCHMARK_BGEMM(x8_packw_gio_x4__reference) +BENCHMARK_BGEMM(x8_packw_gio_x8__reference) +BENCHMARK_BGEMM(x8_packw_gio_x16__reference) 
+BENCHMARK_BGEMM(x8_packw_gio_x32__reference) +BENCHMARK_BGEMM(x8_packw_gio_x8c8__reference) + static void qs8_packw__reference( size_t batch, size_t dim_n, @@ -428,6 +609,43 @@ static void qs8_packw_x16c8__reference(benchmark::State& state, const char* net) BENCHMARK_BGEMM(qs8_packw_x8c8__reference) BENCHMARK_BGEMM(qs8_packw_x16c8__reference) +static void qs8_packw_gio__reference( + size_t batch, + size_t dim_n, + size_t dim_k, + size_t nr, + size_t kr, + size_t sr, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + xnn_pack_qs8_gemm_gio_w(batch, dim_n, dim_k, nr, kr, sr, dim_n, + reinterpret_cast(weights), + reinterpret_cast(bias), + static_cast(scale), + static_cast(packed_weights), + extra_bytes, + reinterpret_cast(params)); +} + +static void qs8_packw_gio_x8c8__reference(benchmark::State& state, const char* net) { + qs8_packw(state, + qs8_packw_gio__reference, + /*nr=*/8, /*kr=*/8, /*sr=*/1); +} +static void qs8_packw_gio_x16c8__reference(benchmark::State& state, const char* net) { + qs8_packw(state, + qs8_packw_gio__reference, + /*nr=*/16, /*kr=*/8, /*sr=*/1); +} + +BENCHMARK_BGEMM(qs8_packw_gio_x8c8__reference) +BENCHMARK_BGEMM(qs8_packw_gio_x16c8__reference) + static void x16_packw__reference( size_t batch, size_t dim_n, diff --git a/bench/qs8-packw.cc b/bench/qs8-packw.cc index dcd8971c795..e4d8ee97cbb 100644 --- a/bench/qs8-packw.cc +++ b/bench/qs8-packw.cc @@ -19,9 +19,19 @@ static void qs8_packw(benchmark::State& state, const char* net, qs8_packw(state, ukernel, nr, kr, sr); } +static void qs8_gio_packw(benchmark::State& state, const char* net, + xnn_qs8_packw_gemm_gio_ukernel_fn ukernel, + uint64_t arch_flags, size_t nr, size_t kr, size_t sr) { + benchmark::utils::CheckArchFlags(state, arch_flags); + qs8_gio_packw(state, ukernel, nr, kr, sr); +} + #define XNN_QS8_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) \ 
BENCHMARK_CAPTURE_BGEMM(qs8_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); +#define XNN_QS8_GIO_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) \ +BENCHMARK_CAPTURE_BGEMM(qs8_gio_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); + #include "qs8-packw/qs8-packw.h" #undef XNN_QS8_UKERNEL diff --git a/bench/x8-packw.cc b/bench/x8-packw.cc index 439fcf3f1c0..013a197d50b 100644 --- a/bench/x8-packw.cc +++ b/bench/x8-packw.cc @@ -19,9 +19,19 @@ static void x8_packw(benchmark::State& state, const char* net, x8_packw(state, ukernel, nr, kr, sr); } +static void x8_gio_packw(benchmark::State& state, const char* net, + xnn_x8_packw_gemm_gio_ukernel_fn ukernel, + uint64_t arch_flags, size_t nr, size_t kr, size_t sr) { + benchmark::utils::CheckArchFlags(state, arch_flags); + x8_gio_packw(state, ukernel, nr, kr, sr); +} + #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ BENCHMARK_CAPTURE_BGEMM(x8_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); +#define XNN_GIO_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ +BENCHMARK_CAPTURE_BGEMM(x8_gio_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr); + #include "x8-packw/x8-packw.h" #undef XNN_UKERNEL diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 324e02918bf..2b5b436296e 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -605,10 +605,16 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u2.c src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u3.c + src/qs8-packw/gen/qs8-packw-x8c4-gemm-gio-scalar.c src/qs8-packw/gen/qs8-packw-x8c4-gemm-goi-scalar.c + src/qs8-packw/gen/qs8-packw-x8c8-gemm-gio-scalar.c src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-scalar.c + src/qs8-packw/gen/qs8-packw-x16c4-gemm-gio-scalar.c src/qs8-packw/gen/qs8-packw-x16c4-gemm-goi-scalar.c + 
src/qs8-packw/gen/qs8-packw-x16c8-gemm-gio-scalar.c + src/qs8-packw/gen/qs8-packw-x32c4-gemm-gio-scalar.c src/qs8-packw/gen/qs8-packw-x32c4-gemm-goi-scalar.c + src/qs8-packw/gen/qs8-packw-x64c4-gemm-gio-scalar.c src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l1c1s1r-minmax-fp32-scalar-fmagic.c src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l1c1s1r-minmax-fp32-scalar-imagic.c @@ -689,7 +695,9 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4-minmax-fp32-scalar-fmagic.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4-minmax-fp32-scalar-imagic.c src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4-minmax-fp32-scalar-lrintf.c + src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-gio-scalar.c src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-scalar.c + src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-gio-scalar.c src/qs8-rsum/gen/qs8-rsum-scalar-u1.c src/qs8-rsum/gen/qs8-rsum-scalar-u2.c src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c @@ -828,6 +836,7 @@ SET(NON_PROD_SCALAR_MICROKERNEL_SRCS src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-u4.c src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-u4.c src/x8-packw/gen/x8-packw-x8-gemm-goi-scalar-u4.c + src/x8-packw/gen/x8-packw-x8c8-gemm-gio-scalar.c src/x8-packw/gen/x8-packw-x16-gemm-goi-scalar-u4.c src/x8-packw/gen/x8-packw-x32-gemm-goi-scalar-u4.c src/x8-transposec/gen/x8-transposec-1x2-scalar-int.c diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index 87338621bf1..dfc35aaf760 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -602,10 +602,16 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-dwconv/gen/qs8-dwconv-25p4c-minmax-fp32-scalar-lrintf.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u2.c", "src/qs8-f32-vcvt/gen/qs8-f32-vcvt-scalar-u3.c", + "src/qs8-packw/gen/qs8-packw-x8c4-gemm-gio-scalar.c", "src/qs8-packw/gen/qs8-packw-x8c4-gemm-goi-scalar.c", + "src/qs8-packw/gen/qs8-packw-x8c8-gemm-gio-scalar.c", 
"src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-scalar.c", + "src/qs8-packw/gen/qs8-packw-x16c4-gemm-gio-scalar.c", "src/qs8-packw/gen/qs8-packw-x16c4-gemm-goi-scalar.c", + "src/qs8-packw/gen/qs8-packw-x16c8-gemm-gio-scalar.c", + "src/qs8-packw/gen/qs8-packw-x32c4-gemm-gio-scalar.c", "src/qs8-packw/gen/qs8-packw-x32c4-gemm-goi-scalar.c", + "src/qs8-packw/gen/qs8-packw-x64c4-gemm-gio-scalar.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-4p2c-minmax-fp32-scalar-imagic.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l1c1s1r-minmax-fp32-scalar-fmagic.c", "src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-5f5m5l1c1s1r-minmax-fp32-scalar-imagic.c", @@ -686,7 +692,9 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4-minmax-fp32-scalar-fmagic.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4-minmax-fp32-scalar-imagic.c", "src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-4x4-minmax-fp32-scalar-lrintf.c", + "src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-gio-scalar.c", "src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-scalar.c", + "src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-gio-scalar.c", "src/qs8-rsum/gen/qs8-rsum-scalar-u1.c", "src/qs8-rsum/gen/qs8-rsum-scalar-u2.c", "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c", @@ -825,6 +833,7 @@ NON_PROD_SCALAR_MICROKERNEL_SRCS = [ "src/x8-packw/gen/x8-packw-x2-gemm-goi-scalar-u4.c", "src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-u4.c", "src/x8-packw/gen/x8-packw-x8-gemm-goi-scalar-u4.c", + "src/x8-packw/gen/x8-packw-x8c8-gemm-gio-scalar.c", "src/x8-packw/gen/x8-packw-x16-gemm-goi-scalar-u4.c", "src/x8-packw/gen/x8-packw-x32-gemm-goi-scalar-u4.c", "src/x8-transposec/gen/x8-transposec-1x2-scalar-int.c", diff --git a/scripts/generate-x8-packw.sh b/scripts/generate-x8-packw.sh index 2b943bb867d..1ca3dc7a78c 100755 --- a/scripts/generate-x8-packw.sh +++ b/scripts/generate-x8-packw.sh @@ -31,6 +31,20 @@ tools/xngen src/x8-packw/kr-scalar.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP=0 tools/xngen src/x8-packw/kr-scalar.c.in -D NR=8 -D 
KR=8 -D TYPE=int8_t -D IZP=128 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-goi-scalar.c & tools/xngen src/x8-packw/kr-scalar.c.in -D NR=16 -D KR=8 -D TYPE=int8_t -D IZP=128 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-goi-scalar.c & +### GIO packing +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=8 -D KR=8 -D DATATYPE=X8 -D TYPE=int8_t -D IZP=0 -o src/x8-packw/gen/x8-packw-x8c8-gemm-gio-scalar.c & + +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=8 -D KR=4 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x8c4-gemm-gio-scalar.c & +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=16 -D KR=4 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x16c4-gemm-gio-scalar.c & +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=32 -D KR=4 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x32c4-gemm-gio-scalar.c & +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=64 -D KR=4 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x64c4-gemm-gio-scalar.c & + +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=8 -D KR=8 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x8c8-gemm-gio-scalar.c & +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=16 -D KR=8 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -o src/qs8-packw/gen/qs8-packw-x16c8-gemm-gio-scalar.c & + +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=8 -D KR=8 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=128 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-gio-scalar.c & +tools/xngen src/x8-packw/kr-gio-scalar.c.in -D NR=16 -D KR=8 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=128 -o src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-gio-scalar.c & + ### AVXVNNI micro-kernels ### C8 packing tools/xngen src/x8-packw/kr-avxvnni.c.in -D NR=8 -D KR=8 -D DATATYPE=QS8 -D TYPE=int8_t -D IZP=0 -D AVX=2 -D PREFETCH=0 -o src/qs8-packw/gen/qs8-packw-x8c8-gemm-goi-avxvnni.c & diff --git a/src/qs8-packw/gen/qs8-packw-x16c4-gemm-gio-scalar.c 
b/src/qs8-packw/gen/qs8-packw-x16c4-gemm-gio-scalar.c new file mode 100644 index 00000000000..ad62824aea1 --- /dev/null +++ b/src/qs8-packw/gen/qs8-packw-x16c4-gemm-gio-scalar.c @@ -0,0 +1,1215 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_packw_gemm_gio_ukernel_x16c4__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 16); + assert(kr == 4); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + + do { + // NC main loop multiple of 16 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 16; n -= 16) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + ((int32_t*) out)[8] = b[8]; + ((int32_t*) out)[9] = b[9]; + ((int32_t*) out)[10] = b[10]; + ((int32_t*) out)[11] = b[11]; + ((int32_t*) out)[12] = b[12]; + ((int32_t*) out)[13] = b[13]; + ((int32_t*) out)[14] = b[14]; + ((int32_t*) out)[15] = b[15]; + b += 16; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + ((int32_t*) out)[8] = 0; + ((int32_t*) out)[9] = 0; + ((int32_t*) out)[10] = 0; + ((int32_t*) out)[11] = 0; + ((int32_t*) out)[12] = 0; + ((int32_t*) out)[13] = 0; + ((int32_t*) out)[14] = 0; + ((int32_t*) out)[15] = 0; + } + out += 16 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + + // KC main loop multiple of 16x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += 
(uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; + out[10] = v2x2; + out[11] = v3x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + 
const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + out[32] = v0x8; + out[33] = v1x8; + out[34] = v2x8; + out[35] = v3x8; + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + out[36] = v0x9; + out[37] = v1x9; + out[38] = v2x9; + out[39] = v3x9; + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + out[40] = v0x10; + out[41] = v1x10; + out[42] = v2x10; + out[43] = v3x10; + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + out[44] = v0x11; + out[45] = v1x11; + out[46] = v2x11; + out[47] = v3x11; + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + out[48] = v0x12; + out[49] = v1x12; + out[50] = v2x12; + out[51] = v3x12; + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += 
(uint32_t) v3x13; + out[52] = v0x13; + out[53] = v1x13; + out[54] = v2x13; + out[55] = v3x13; + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + out[56] = v0x14; + out[57] = v1x14; + out[58] = v2x14; + out[59] = v3x14; + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + out[60] = v0x15; + out[61] = v1x15; + out[62] = v2x15; + out[63] = v3x15; + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 64; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + const int8_t v0x3 = 
w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[32] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[33] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) 
v2x8; + out[34] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[35] = v3x8; + } + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[36] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[37] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[38] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[39] = v3x9; + } + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[40] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[41] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[42] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[43] = v3x10; + } + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[44] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[45] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[46] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[47] = v3x11; + } + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[48] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[49] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[50] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[51] = v3x12; + } + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[52] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[53] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[54] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[55] = v3x13; + } + const 
int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[56] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[57] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[58] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[59] = v3x14; + } + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[60] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[61] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[62] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[63] = v3x15; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 64; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + packed_b[15] -= ksum15 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 16; + } + + // NC remainder (1..15) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (16 - n) * sizeof(int32_t); + + // NR remainder has less than 16 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + + uint32_t 
ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + + // KC main loop multiple of 16x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; + out[10] = v2x2; + out[11] = v3x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + } + if (5 < n) { + const 
int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + out[32] = v0x8; + out[33] = v1x8; + out[34] = v2x8; + out[35] = v3x8; + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + out[36] = v0x9; + out[37] = v1x9; + out[38] = v2x9; + out[39] = v3x9; + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + out[40] = v0x10; + out[41] = v1x10; + out[42] = v2x10; + out[43] = v3x10; + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const 
int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + out[44] = v0x11; + out[45] = v1x11; + out[46] = v2x11; + out[47] = v3x11; + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + out[48] = v0x12; + out[49] = v1x12; + out[50] = v2x12; + out[51] = v3x12; + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + out[52] = v0x13; + out[53] = v1x13; + out[54] = v2x13; + out[55] = v3x13; + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + out[56] = v0x14; + out[57] = v1x14; + out[58] = v2x14; + out[59] = v3x14; + } + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 64; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = 
v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + 
const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[32] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[33] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[34] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[35] = v3x8; + } + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[36] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[37] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[38] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[39] = v3x9; + } + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[40] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[41] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[42] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[43] = v3x10; + } + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[44] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[45] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[46] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[47] = v3x11; + } 
+ } + if (12 < n) { + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[48] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[49] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[50] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[51] = v3x12; + } + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[52] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[53] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[54] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[55] = v3x13; + } + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[56] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[57] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[58] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[59] = v3x14; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 64; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/gen/qs8-packw-x16c8-gemm-gio-scalar.c b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-gio-scalar.c new file mode 
100644 index 00000000000..daca0a51041 --- /dev/null +++ b/src/qs8-packw/gen/qs8-packw-x16c8-gemm-gio-scalar.c @@ -0,0 +1,2231 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_packw_gemm_gio_ukernel_x16c8__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 16); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + + do { + // NC main loop multiple of 16 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 16; n -= 16) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + ((int32_t*) out)[8] = b[8]; + ((int32_t*) out)[9] = b[9]; + ((int32_t*) out)[10] = b[10]; + ((int32_t*) out)[11] = b[11]; + ((int32_t*) out)[12] = b[12]; + ((int32_t*) out)[13] = b[13]; + ((int32_t*) out)[14] = b[14]; + ((int32_t*) out)[15] = b[15]; + b += 16; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + ((int32_t*) out)[8] = 0; + ((int32_t*) out)[9] = 0; + ((int32_t*) out)[10] = 0; + ((int32_t*) out)[11] = 0; + ((int32_t*) out)[12] = 0; + ((int32_t*) out)[13] = 0; + ((int32_t*) out)[14] = 0; + ((int32_t*) out)[15] = 0; + } + out += 16 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + + // KC main loop multiple of 16x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t 
v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) 
v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) 
v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + ksum7 += (uint32_t) v4x7; + ksum7 += (uint32_t) v5x7; + ksum7 += (uint32_t) v6x7; + ksum7 += (uint32_t) v7x7; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; + out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + const int8_t v4x8 = w4[8]; + const int8_t v5x8 = w5[8]; + const int8_t v6x8 = w6[8]; + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + ksum8 += (uint32_t) v4x8; + ksum8 += (uint32_t) v5x8; + ksum8 += (uint32_t) v6x8; + ksum8 += (uint32_t) v7x8; + out[64] = v0x8; + out[65] = v1x8; + out[66] = v2x8; + out[67] = v3x8; + out[68] = v4x8; + out[69] = v5x8; + out[70] = v6x8; + out[71] = v7x8; + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + const int8_t v4x9 = w4[9]; + const int8_t v5x9 = w5[9]; + const int8_t v6x9 = w6[9]; + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + ksum9 += (uint32_t) v4x9; + ksum9 += (uint32_t) v5x9; + ksum9 += (uint32_t) v6x9; + ksum9 += (uint32_t) v7x9; + out[72] = v0x9; + out[73] = v1x9; + out[74] = v2x9; + out[75] = v3x9; + out[76] = v4x9; + out[77] = v5x9; + out[78] = v6x9; + out[79] = v7x9; + const int8_t v0x10 = w0[10]; + const int8_t 
v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + const int8_t v4x10 = w4[10]; + const int8_t v5x10 = w5[10]; + const int8_t v6x10 = w6[10]; + const int8_t v7x10 = w7[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + ksum10 += (uint32_t) v4x10; + ksum10 += (uint32_t) v5x10; + ksum10 += (uint32_t) v6x10; + ksum10 += (uint32_t) v7x10; + out[80] = v0x10; + out[81] = v1x10; + out[82] = v2x10; + out[83] = v3x10; + out[84] = v4x10; + out[85] = v5x10; + out[86] = v6x10; + out[87] = v7x10; + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + const int8_t v4x11 = w4[11]; + const int8_t v5x11 = w5[11]; + const int8_t v6x11 = w6[11]; + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + ksum11 += (uint32_t) v4x11; + ksum11 += (uint32_t) v5x11; + ksum11 += (uint32_t) v6x11; + ksum11 += (uint32_t) v7x11; + out[88] = v0x11; + out[89] = v1x11; + out[90] = v2x11; + out[91] = v3x11; + out[92] = v4x11; + out[93] = v5x11; + out[94] = v6x11; + out[95] = v7x11; + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + const int8_t v4x12 = w4[12]; + const int8_t v5x12 = w5[12]; + const int8_t v6x12 = w6[12]; + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + ksum12 += (uint32_t) v4x12; + ksum12 += (uint32_t) v5x12; + ksum12 += (uint32_t) v6x12; + ksum12 += (uint32_t) v7x12; + out[96] = v0x12; + out[97] = v1x12; + out[98] = v2x12; + out[99] = v3x12; + out[100] = v4x12; + out[101] = v5x12; + out[102] = v6x12; + out[103] = v7x12; + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = 
w3[13]; + const int8_t v4x13 = w4[13]; + const int8_t v5x13 = w5[13]; + const int8_t v6x13 = w6[13]; + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + ksum13 += (uint32_t) v4x13; + ksum13 += (uint32_t) v5x13; + ksum13 += (uint32_t) v6x13; + ksum13 += (uint32_t) v7x13; + out[104] = v0x13; + out[105] = v1x13; + out[106] = v2x13; + out[107] = v3x13; + out[108] = v4x13; + out[109] = v5x13; + out[110] = v6x13; + out[111] = v7x13; + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + const int8_t v4x14 = w4[14]; + const int8_t v5x14 = w5[14]; + const int8_t v6x14 = w6[14]; + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + ksum14 += (uint32_t) v4x14; + ksum14 += (uint32_t) v5x14; + ksum14 += (uint32_t) v6x14; + ksum14 += (uint32_t) v7x14; + out[112] = v0x14; + out[113] = v1x14; + out[114] = v2x14; + out[115] = v3x14; + out[116] = v4x14; + out[117] = v5x14; + out[118] = v6x14; + out[119] = v7x14; + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + const int8_t v4x15 = w4[15]; + const int8_t v5x15 = w5[15]; + const int8_t v6x15 = w6[15]; + const int8_t v7x15 = w7[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + ksum15 += (uint32_t) v4x15; + ksum15 += (uint32_t) v5x15; + ksum15 += (uint32_t) v6x15; + ksum15 += (uint32_t) v7x15; + out[120] = v0x15; + out[121] = v1x15; + out[122] = v2x15; + out[123] = v3x15; + out[124] = v4x15; + out[125] = v5x15; + out[126] = v6x15; + out[127] = v7x15; + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * 
k_stride; + out += 128; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = 
v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; 
+ out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + ksum7 += (uint32_t) v4x7; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + ksum7 += (uint32_t) v5x7; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + ksum7 += (uint32_t) v6x7; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v7x7; + out[63] = v7x7; + } + const int8_t v0x8 = w0[8]; + ksum8 += 
(uint32_t) v0x8; + out[64] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[65] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[66] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[67] = v3x8; + } + if (4 < k) { + const int8_t v4x8 = w4[8]; + ksum8 += (uint32_t) v4x8; + out[68] = v4x8; + } + if (5 < k) { + const int8_t v5x8 = w5[8]; + ksum8 += (uint32_t) v5x8; + out[69] = v5x8; + } + if (6 < k) { + const int8_t v6x8 = w6[8]; + ksum8 += (uint32_t) v6x8; + out[70] = v6x8; + } + if (7 < k) { + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v7x8; + out[71] = v7x8; + } + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[72] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[73] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[74] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[75] = v3x9; + } + if (4 < k) { + const int8_t v4x9 = w4[9]; + ksum9 += (uint32_t) v4x9; + out[76] = v4x9; + } + if (5 < k) { + const int8_t v5x9 = w5[9]; + ksum9 += (uint32_t) v5x9; + out[77] = v5x9; + } + if (6 < k) { + const int8_t v6x9 = w6[9]; + ksum9 += (uint32_t) v6x9; + out[78] = v6x9; + } + if (7 < k) { + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v7x9; + out[79] = v7x9; + } + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[80] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[81] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[82] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[83] = v3x10; + } + if (4 < k) { + const int8_t v4x10 = w4[10]; + ksum10 += (uint32_t) v4x10; + out[84] = v4x10; + } + if (5 < k) { + const int8_t v5x10 = w5[10]; + ksum10 += (uint32_t) v5x10; + out[85] = v5x10; + 
} + if (6 < k) { + const int8_t v6x10 = w6[10]; + ksum10 += (uint32_t) v6x10; + out[86] = v6x10; + } + if (7 < k) { + const int8_t v7x10 = w7[10]; + ksum10 += (uint32_t) v7x10; + out[87] = v7x10; + } + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[88] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[89] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[90] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[91] = v3x11; + } + if (4 < k) { + const int8_t v4x11 = w4[11]; + ksum11 += (uint32_t) v4x11; + out[92] = v4x11; + } + if (5 < k) { + const int8_t v5x11 = w5[11]; + ksum11 += (uint32_t) v5x11; + out[93] = v5x11; + } + if (6 < k) { + const int8_t v6x11 = w6[11]; + ksum11 += (uint32_t) v6x11; + out[94] = v6x11; + } + if (7 < k) { + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v7x11; + out[95] = v7x11; + } + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[96] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[97] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[98] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[99] = v3x12; + } + if (4 < k) { + const int8_t v4x12 = w4[12]; + ksum12 += (uint32_t) v4x12; + out[100] = v4x12; + } + if (5 < k) { + const int8_t v5x12 = w5[12]; + ksum12 += (uint32_t) v5x12; + out[101] = v5x12; + } + if (6 < k) { + const int8_t v6x12 = w6[12]; + ksum12 += (uint32_t) v6x12; + out[102] = v6x12; + } + if (7 < k) { + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v7x12; + out[103] = v7x12; + } + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[104] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[105] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) 
v2x13; + out[106] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[107] = v3x13; + } + if (4 < k) { + const int8_t v4x13 = w4[13]; + ksum13 += (uint32_t) v4x13; + out[108] = v4x13; + } + if (5 < k) { + const int8_t v5x13 = w5[13]; + ksum13 += (uint32_t) v5x13; + out[109] = v5x13; + } + if (6 < k) { + const int8_t v6x13 = w6[13]; + ksum13 += (uint32_t) v6x13; + out[110] = v6x13; + } + if (7 < k) { + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v7x13; + out[111] = v7x13; + } + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[112] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[113] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[114] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[115] = v3x14; + } + if (4 < k) { + const int8_t v4x14 = w4[14]; + ksum14 += (uint32_t) v4x14; + out[116] = v4x14; + } + if (5 < k) { + const int8_t v5x14 = w5[14]; + ksum14 += (uint32_t) v5x14; + out[117] = v5x14; + } + if (6 < k) { + const int8_t v6x14 = w6[14]; + ksum14 += (uint32_t) v6x14; + out[118] = v6x14; + } + if (7 < k) { + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v7x14; + out[119] = v7x14; + } + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[120] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[121] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[122] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[123] = v3x15; + } + if (4 < k) { + const int8_t v4x15 = w4[15]; + ksum15 += (uint32_t) v4x15; + out[124] = v4x15; + } + if (5 < k) { + const int8_t v5x15 = w5[15]; + ksum15 += (uint32_t) v5x15; + out[125] = v5x15; + } + if (6 < k) { + const int8_t v6x15 = w6[15]; + ksum15 += (uint32_t) v6x15; + out[126] = v6x15; + } + if 
(7 < k) { + const int8_t v7x15 = w7[15]; + ksum15 += (uint32_t) v7x15; + out[127] = v7x15; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 128; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + packed_b[15] -= ksum15 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 16; + } + + // NC remainder (1..15) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (16 - n) * sizeof(int32_t); + + // NR remainder has less than 16 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + + // KC main loop multiple of 16x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t 
v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = 
w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) 
v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + ksum7 += (uint32_t) v4x7; + ksum7 += (uint32_t) v5x7; + ksum7 += (uint32_t) v6x7; + ksum7 += (uint32_t) v7x7; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; + out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + const int8_t v4x8 = w4[8]; + const int8_t v5x8 = w5[8]; + const int8_t v6x8 = w6[8]; + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + ksum8 += (uint32_t) v4x8; + ksum8 += (uint32_t) v5x8; + ksum8 += (uint32_t) v6x8; + ksum8 += (uint32_t) v7x8; + out[64] = v0x8; + out[65] = v1x8; + out[66] = v2x8; + out[67] = v3x8; + out[68] = v4x8; + out[69] = v5x8; + out[70] = v6x8; + out[71] = v7x8; + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + const int8_t v4x9 = w4[9]; + const int8_t v5x9 = w5[9]; + const int8_t v6x9 = w6[9]; + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + ksum9 += (uint32_t) v4x9; + ksum9 += (uint32_t) v5x9; + ksum9 += (uint32_t) v6x9; + ksum9 += (uint32_t) 
v7x9; + out[72] = v0x9; + out[73] = v1x9; + out[74] = v2x9; + out[75] = v3x9; + out[76] = v4x9; + out[77] = v5x9; + out[78] = v6x9; + out[79] = v7x9; + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + const int8_t v4x10 = w4[10]; + const int8_t v5x10 = w5[10]; + const int8_t v6x10 = w6[10]; + const int8_t v7x10 = w7[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + ksum10 += (uint32_t) v4x10; + ksum10 += (uint32_t) v5x10; + ksum10 += (uint32_t) v6x10; + ksum10 += (uint32_t) v7x10; + out[80] = v0x10; + out[81] = v1x10; + out[82] = v2x10; + out[83] = v3x10; + out[84] = v4x10; + out[85] = v5x10; + out[86] = v6x10; + out[87] = v7x10; + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + const int8_t v4x11 = w4[11]; + const int8_t v5x11 = w5[11]; + const int8_t v6x11 = w6[11]; + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + ksum11 += (uint32_t) v4x11; + ksum11 += (uint32_t) v5x11; + ksum11 += (uint32_t) v6x11; + ksum11 += (uint32_t) v7x11; + out[88] = v0x11; + out[89] = v1x11; + out[90] = v2x11; + out[91] = v3x11; + out[92] = v4x11; + out[93] = v5x11; + out[94] = v6x11; + out[95] = v7x11; + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + const int8_t v4x12 = w4[12]; + const int8_t v5x12 = w5[12]; + const int8_t v6x12 = w6[12]; + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + ksum12 += (uint32_t) v4x12; + ksum12 += (uint32_t) v5x12; + ksum12 += (uint32_t) v6x12; + ksum12 += (uint32_t) v7x12; + out[96] = v0x12; + 
out[97] = v1x12; + out[98] = v2x12; + out[99] = v3x12; + out[100] = v4x12; + out[101] = v5x12; + out[102] = v6x12; + out[103] = v7x12; + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + const int8_t v4x13 = w4[13]; + const int8_t v5x13 = w5[13]; + const int8_t v6x13 = w6[13]; + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + ksum13 += (uint32_t) v4x13; + ksum13 += (uint32_t) v5x13; + ksum13 += (uint32_t) v6x13; + ksum13 += (uint32_t) v7x13; + out[104] = v0x13; + out[105] = v1x13; + out[106] = v2x13; + out[107] = v3x13; + out[108] = v4x13; + out[109] = v5x13; + out[110] = v6x13; + out[111] = v7x13; + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + const int8_t v4x14 = w4[14]; + const int8_t v5x14 = w5[14]; + const int8_t v6x14 = w6[14]; + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + ksum14 += (uint32_t) v4x14; + ksum14 += (uint32_t) v5x14; + ksum14 += (uint32_t) v6x14; + ksum14 += (uint32_t) v7x14; + out[112] = v0x14; + out[113] = v1x14; + out[114] = v2x14; + out[115] = v3x14; + out[116] = v4x14; + out[117] = v5x14; + out[118] = v6x14; + out[119] = v7x14; + } + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 128; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + 
out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + } + if (3 < 
n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = 
w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + ksum7 += (uint32_t) v4x7; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + ksum7 += (uint32_t) v5x7; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + ksum7 += (uint32_t) v6x7; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v7x7; + out[63] = v7x7; + } + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[64] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[65] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += 
(uint32_t) v2x8; + out[66] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[67] = v3x8; + } + if (4 < k) { + const int8_t v4x8 = w4[8]; + ksum8 += (uint32_t) v4x8; + out[68] = v4x8; + } + if (5 < k) { + const int8_t v5x8 = w5[8]; + ksum8 += (uint32_t) v5x8; + out[69] = v5x8; + } + if (6 < k) { + const int8_t v6x8 = w6[8]; + ksum8 += (uint32_t) v6x8; + out[70] = v6x8; + } + if (7 < k) { + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v7x8; + out[71] = v7x8; + } + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[72] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[73] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[74] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[75] = v3x9; + } + if (4 < k) { + const int8_t v4x9 = w4[9]; + ksum9 += (uint32_t) v4x9; + out[76] = v4x9; + } + if (5 < k) { + const int8_t v5x9 = w5[9]; + ksum9 += (uint32_t) v5x9; + out[77] = v5x9; + } + if (6 < k) { + const int8_t v6x9 = w6[9]; + ksum9 += (uint32_t) v6x9; + out[78] = v6x9; + } + if (7 < k) { + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v7x9; + out[79] = v7x9; + } + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[80] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[81] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[82] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[83] = v3x10; + } + if (4 < k) { + const int8_t v4x10 = w4[10]; + ksum10 += (uint32_t) v4x10; + out[84] = v4x10; + } + if (5 < k) { + const int8_t v5x10 = w5[10]; + ksum10 += (uint32_t) v5x10; + out[85] = v5x10; + } + if (6 < k) { + const int8_t v6x10 = w6[10]; + ksum10 += (uint32_t) v6x10; + out[86] = v6x10; + } + if (7 < k) { + const int8_t v7x10 = 
w7[10]; + ksum10 += (uint32_t) v7x10; + out[87] = v7x10; + } + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[88] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[89] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[90] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[91] = v3x11; + } + if (4 < k) { + const int8_t v4x11 = w4[11]; + ksum11 += (uint32_t) v4x11; + out[92] = v4x11; + } + if (5 < k) { + const int8_t v5x11 = w5[11]; + ksum11 += (uint32_t) v5x11; + out[93] = v5x11; + } + if (6 < k) { + const int8_t v6x11 = w6[11]; + ksum11 += (uint32_t) v6x11; + out[94] = v6x11; + } + if (7 < k) { + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v7x11; + out[95] = v7x11; + } + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[96] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[97] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[98] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[99] = v3x12; + } + if (4 < k) { + const int8_t v4x12 = w4[12]; + ksum12 += (uint32_t) v4x12; + out[100] = v4x12; + } + if (5 < k) { + const int8_t v5x12 = w5[12]; + ksum12 += (uint32_t) v5x12; + out[101] = v5x12; + } + if (6 < k) { + const int8_t v6x12 = w6[12]; + ksum12 += (uint32_t) v6x12; + out[102] = v6x12; + } + if (7 < k) { + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v7x12; + out[103] = v7x12; + } + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[104] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[105] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[106] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + 
ksum13 += (uint32_t) v3x13; + out[107] = v3x13; + } + if (4 < k) { + const int8_t v4x13 = w4[13]; + ksum13 += (uint32_t) v4x13; + out[108] = v4x13; + } + if (5 < k) { + const int8_t v5x13 = w5[13]; + ksum13 += (uint32_t) v5x13; + out[109] = v5x13; + } + if (6 < k) { + const int8_t v6x13 = w6[13]; + ksum13 += (uint32_t) v6x13; + out[110] = v6x13; + } + if (7 < k) { + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v7x13; + out[111] = v7x13; + } + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[112] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[113] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[114] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[115] = v3x14; + } + if (4 < k) { + const int8_t v4x14 = w4[14]; + ksum14 += (uint32_t) v4x14; + out[116] = v4x14; + } + if (5 < k) { + const int8_t v5x14 = w5[14]; + ksum14 += (uint32_t) v5x14; + out[117] = v5x14; + } + if (6 < k) { + const int8_t v6x14 = w6[14]; + ksum14 += (uint32_t) v6x14; + out[118] = v6x14; + } + if (7 < k) { + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v7x14; + out[119] = v7x14; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 128; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while 
(--g != 0); +} diff --git a/src/qs8-packw/gen/qs8-packw-x32c4-gemm-gio-scalar.c b/src/qs8-packw/gen/qs8-packw-x32c4-gemm-gio-scalar.c new file mode 100644 index 00000000000..f1d2fbe7b72 --- /dev/null +++ b/src/qs8-packw/gen/qs8-packw-x32c4-gemm-gio-scalar.c @@ -0,0 +1,2335 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_packw_gemm_gio_ukernel_x32c4__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 32); + assert(kr == 4); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + + do { + // NC main loop multiple of 32 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 32; n -= 32) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + ((int32_t*) out)[8] = b[8]; + ((int32_t*) out)[9] = b[9]; + ((int32_t*) out)[10] = b[10]; + ((int32_t*) out)[11] = b[11]; + ((int32_t*) out)[12] = b[12]; + ((int32_t*) out)[13] = b[13]; + ((int32_t*) out)[14] = b[14]; + ((int32_t*) out)[15] = b[15]; + ((int32_t*) out)[16] = b[16]; + ((int32_t*) out)[17] = b[17]; + ((int32_t*) out)[18] = b[18]; + ((int32_t*) out)[19] = b[19]; + ((int32_t*) out)[20] = b[20]; + ((int32_t*) out)[21] = b[21]; + ((int32_t*) out)[22] = b[22]; + ((int32_t*) out)[23] = b[23]; + ((int32_t*) out)[24] = b[24]; + ((int32_t*) out)[25] = b[25]; + ((int32_t*) out)[26] = b[26]; + ((int32_t*) out)[27] = b[27]; + ((int32_t*) out)[28] = b[28]; + ((int32_t*) out)[29] = b[29]; + ((int32_t*) out)[30] = b[30]; + ((int32_t*) out)[31] = b[31]; + b += 32; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + ((int32_t*) out)[8] = 0; + ((int32_t*) out)[9] = 0; + ((int32_t*) out)[10] = 0; + ((int32_t*) out)[11] = 0; + ((int32_t*) out)[12] = 0; + ((int32_t*) out)[13] = 0; + ((int32_t*) out)[14] = 0; + ((int32_t*) out)[15] = 0; + ((int32_t*) out)[16] = 0; + ((int32_t*) out)[17] = 0; + ((int32_t*) out)[18] = 0; + ((int32_t*) out)[19] = 0; + ((int32_t*) out)[20] = 0; + ((int32_t*) out)[21] = 0; + ((int32_t*) out)[22] = 0; + ((int32_t*) out)[23] = 0; + ((int32_t*) out)[24] = 0; + 
((int32_t*) out)[25] = 0; + ((int32_t*) out)[26] = 0; + ((int32_t*) out)[27] = 0; + ((int32_t*) out)[28] = 0; + ((int32_t*) out)[29] = 0; + ((int32_t*) out)[30] = 0; + ((int32_t*) out)[31] = 0; + } + out += 32 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + uint32_t ksum16 = 0; + uint32_t ksum17 = 0; + uint32_t ksum18 = 0; + uint32_t ksum19 = 0; + uint32_t ksum20 = 0; + uint32_t ksum21 = 0; + uint32_t ksum22 = 0; + uint32_t ksum23 = 0; + uint32_t ksum24 = 0; + uint32_t ksum25 = 0; + uint32_t ksum26 = 0; + uint32_t ksum27 = 0; + uint32_t ksum28 = 0; + uint32_t ksum29 = 0; + uint32_t ksum30 = 0; + uint32_t ksum31 = 0; + + // KC main loop multiple of 32x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; 
+ out[10] = v2x2; + out[11] = v3x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + out[32] = v0x8; + out[33] = v1x8; + out[34] = v2x8; + out[35] = v3x8; + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += 
(uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + out[36] = v0x9; + out[37] = v1x9; + out[38] = v2x9; + out[39] = v3x9; + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + out[40] = v0x10; + out[41] = v1x10; + out[42] = v2x10; + out[43] = v3x10; + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + out[44] = v0x11; + out[45] = v1x11; + out[46] = v2x11; + out[47] = v3x11; + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + out[48] = v0x12; + out[49] = v1x12; + out[50] = v2x12; + out[51] = v3x12; + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + out[52] = v0x13; + out[53] = v1x13; + out[54] = v2x13; + out[55] = v3x13; + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + out[56] = v0x14; + out[57] = v1x14; + out[58] = v2x14; + out[59] = v3x14; + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + out[60] 
= v0x15; + out[61] = v1x15; + out[62] = v2x15; + out[63] = v3x15; + const int8_t v0x16 = w0[16]; + const int8_t v1x16 = w1[16]; + const int8_t v2x16 = w2[16]; + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v0x16; + ksum16 += (uint32_t) v1x16; + ksum16 += (uint32_t) v2x16; + ksum16 += (uint32_t) v3x16; + out[64] = v0x16; + out[65] = v1x16; + out[66] = v2x16; + out[67] = v3x16; + const int8_t v0x17 = w0[17]; + const int8_t v1x17 = w1[17]; + const int8_t v2x17 = w2[17]; + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v0x17; + ksum17 += (uint32_t) v1x17; + ksum17 += (uint32_t) v2x17; + ksum17 += (uint32_t) v3x17; + out[68] = v0x17; + out[69] = v1x17; + out[70] = v2x17; + out[71] = v3x17; + const int8_t v0x18 = w0[18]; + const int8_t v1x18 = w1[18]; + const int8_t v2x18 = w2[18]; + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v0x18; + ksum18 += (uint32_t) v1x18; + ksum18 += (uint32_t) v2x18; + ksum18 += (uint32_t) v3x18; + out[72] = v0x18; + out[73] = v1x18; + out[74] = v2x18; + out[75] = v3x18; + const int8_t v0x19 = w0[19]; + const int8_t v1x19 = w1[19]; + const int8_t v2x19 = w2[19]; + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v0x19; + ksum19 += (uint32_t) v1x19; + ksum19 += (uint32_t) v2x19; + ksum19 += (uint32_t) v3x19; + out[76] = v0x19; + out[77] = v1x19; + out[78] = v2x19; + out[79] = v3x19; + const int8_t v0x20 = w0[20]; + const int8_t v1x20 = w1[20]; + const int8_t v2x20 = w2[20]; + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v0x20; + ksum20 += (uint32_t) v1x20; + ksum20 += (uint32_t) v2x20; + ksum20 += (uint32_t) v3x20; + out[80] = v0x20; + out[81] = v1x20; + out[82] = v2x20; + out[83] = v3x20; + const int8_t v0x21 = w0[21]; + const int8_t v1x21 = w1[21]; + const int8_t v2x21 = w2[21]; + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v0x21; + ksum21 += (uint32_t) v1x21; + ksum21 += (uint32_t) v2x21; + ksum21 += (uint32_t) v3x21; + out[84] = v0x21; + out[85] = v1x21; + out[86] = v2x21; + out[87] = v3x21; + const 
int8_t v0x22 = w0[22]; + const int8_t v1x22 = w1[22]; + const int8_t v2x22 = w2[22]; + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v0x22; + ksum22 += (uint32_t) v1x22; + ksum22 += (uint32_t) v2x22; + ksum22 += (uint32_t) v3x22; + out[88] = v0x22; + out[89] = v1x22; + out[90] = v2x22; + out[91] = v3x22; + const int8_t v0x23 = w0[23]; + const int8_t v1x23 = w1[23]; + const int8_t v2x23 = w2[23]; + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v0x23; + ksum23 += (uint32_t) v1x23; + ksum23 += (uint32_t) v2x23; + ksum23 += (uint32_t) v3x23; + out[92] = v0x23; + out[93] = v1x23; + out[94] = v2x23; + out[95] = v3x23; + const int8_t v0x24 = w0[24]; + const int8_t v1x24 = w1[24]; + const int8_t v2x24 = w2[24]; + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v0x24; + ksum24 += (uint32_t) v1x24; + ksum24 += (uint32_t) v2x24; + ksum24 += (uint32_t) v3x24; + out[96] = v0x24; + out[97] = v1x24; + out[98] = v2x24; + out[99] = v3x24; + const int8_t v0x25 = w0[25]; + const int8_t v1x25 = w1[25]; + const int8_t v2x25 = w2[25]; + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v0x25; + ksum25 += (uint32_t) v1x25; + ksum25 += (uint32_t) v2x25; + ksum25 += (uint32_t) v3x25; + out[100] = v0x25; + out[101] = v1x25; + out[102] = v2x25; + out[103] = v3x25; + const int8_t v0x26 = w0[26]; + const int8_t v1x26 = w1[26]; + const int8_t v2x26 = w2[26]; + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v0x26; + ksum26 += (uint32_t) v1x26; + ksum26 += (uint32_t) v2x26; + ksum26 += (uint32_t) v3x26; + out[104] = v0x26; + out[105] = v1x26; + out[106] = v2x26; + out[107] = v3x26; + const int8_t v0x27 = w0[27]; + const int8_t v1x27 = w1[27]; + const int8_t v2x27 = w2[27]; + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v0x27; + ksum27 += (uint32_t) v1x27; + ksum27 += (uint32_t) v2x27; + ksum27 += (uint32_t) v3x27; + out[108] = v0x27; + out[109] = v1x27; + out[110] = v2x27; + out[111] = v3x27; + const int8_t v0x28 = w0[28]; + const int8_t v1x28 = w1[28]; + const 
int8_t v2x28 = w2[28]; + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v0x28; + ksum28 += (uint32_t) v1x28; + ksum28 += (uint32_t) v2x28; + ksum28 += (uint32_t) v3x28; + out[112] = v0x28; + out[113] = v1x28; + out[114] = v2x28; + out[115] = v3x28; + const int8_t v0x29 = w0[29]; + const int8_t v1x29 = w1[29]; + const int8_t v2x29 = w2[29]; + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v0x29; + ksum29 += (uint32_t) v1x29; + ksum29 += (uint32_t) v2x29; + ksum29 += (uint32_t) v3x29; + out[116] = v0x29; + out[117] = v1x29; + out[118] = v2x29; + out[119] = v3x29; + const int8_t v0x30 = w0[30]; + const int8_t v1x30 = w1[30]; + const int8_t v2x30 = w2[30]; + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v0x30; + ksum30 += (uint32_t) v1x30; + ksum30 += (uint32_t) v2x30; + ksum30 += (uint32_t) v3x30; + out[120] = v0x30; + out[121] = v1x30; + out[122] = v2x30; + out[123] = v3x30; + const int8_t v0x31 = w0[31]; + const int8_t v1x31 = w1[31]; + const int8_t v2x31 = w2[31]; + const int8_t v3x31 = w3[31]; + ksum31 += (uint32_t) v0x31; + ksum31 += (uint32_t) v1x31; + ksum31 += (uint32_t) v2x31; + ksum31 += (uint32_t) v3x31; + out[124] = v0x31; + out[125] = v1x31; + out[126] = v2x31; + out[127] = v3x31; + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 128; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + 
out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const int8_t 
v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[32] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[33] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[34] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[35] = v3x8; + } + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[36] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[37] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[38] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[39] = v3x9; + } + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[40] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[41] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[42] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[43] = v3x10; + } + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[44] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[45] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[46] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[47] = v3x11; + } + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[48] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[49] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[50] = v2x12; + } + 
if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[51] = v3x12; + } + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[52] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[53] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[54] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[55] = v3x13; + } + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[56] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[57] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[58] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[59] = v3x14; + } + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[60] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[61] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[62] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[63] = v3x15; + } + const int8_t v0x16 = w0[16]; + ksum16 += (uint32_t) v0x16; + out[64] = v0x16; + if (1 < k) { + const int8_t v1x16 = w1[16]; + ksum16 += (uint32_t) v1x16; + out[65] = v1x16; + } + if (2 < k) { + const int8_t v2x16 = w2[16]; + ksum16 += (uint32_t) v2x16; + out[66] = v2x16; + } + if (3 < k) { + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v3x16; + out[67] = v3x16; + } + const int8_t v0x17 = w0[17]; + ksum17 += (uint32_t) v0x17; + out[68] = v0x17; + if (1 < k) { + const int8_t v1x17 = w1[17]; + ksum17 += (uint32_t) v1x17; + out[69] = v1x17; + } + if (2 < k) { + const int8_t v2x17 = w2[17]; + ksum17 += (uint32_t) v2x17; + out[70] = v2x17; + } + if (3 < k) { + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v3x17; + out[71] = v3x17; + } + const int8_t 
v0x18 = w0[18]; + ksum18 += (uint32_t) v0x18; + out[72] = v0x18; + if (1 < k) { + const int8_t v1x18 = w1[18]; + ksum18 += (uint32_t) v1x18; + out[73] = v1x18; + } + if (2 < k) { + const int8_t v2x18 = w2[18]; + ksum18 += (uint32_t) v2x18; + out[74] = v2x18; + } + if (3 < k) { + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v3x18; + out[75] = v3x18; + } + const int8_t v0x19 = w0[19]; + ksum19 += (uint32_t) v0x19; + out[76] = v0x19; + if (1 < k) { + const int8_t v1x19 = w1[19]; + ksum19 += (uint32_t) v1x19; + out[77] = v1x19; + } + if (2 < k) { + const int8_t v2x19 = w2[19]; + ksum19 += (uint32_t) v2x19; + out[78] = v2x19; + } + if (3 < k) { + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v3x19; + out[79] = v3x19; + } + const int8_t v0x20 = w0[20]; + ksum20 += (uint32_t) v0x20; + out[80] = v0x20; + if (1 < k) { + const int8_t v1x20 = w1[20]; + ksum20 += (uint32_t) v1x20; + out[81] = v1x20; + } + if (2 < k) { + const int8_t v2x20 = w2[20]; + ksum20 += (uint32_t) v2x20; + out[82] = v2x20; + } + if (3 < k) { + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v3x20; + out[83] = v3x20; + } + const int8_t v0x21 = w0[21]; + ksum21 += (uint32_t) v0x21; + out[84] = v0x21; + if (1 < k) { + const int8_t v1x21 = w1[21]; + ksum21 += (uint32_t) v1x21; + out[85] = v1x21; + } + if (2 < k) { + const int8_t v2x21 = w2[21]; + ksum21 += (uint32_t) v2x21; + out[86] = v2x21; + } + if (3 < k) { + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v3x21; + out[87] = v3x21; + } + const int8_t v0x22 = w0[22]; + ksum22 += (uint32_t) v0x22; + out[88] = v0x22; + if (1 < k) { + const int8_t v1x22 = w1[22]; + ksum22 += (uint32_t) v1x22; + out[89] = v1x22; + } + if (2 < k) { + const int8_t v2x22 = w2[22]; + ksum22 += (uint32_t) v2x22; + out[90] = v2x22; + } + if (3 < k) { + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v3x22; + out[91] = v3x22; + } + const int8_t v0x23 = w0[23]; + ksum23 += (uint32_t) v0x23; + out[92] = v0x23; + if (1 < k) { + const int8_t v1x23 = w1[23]; + 
ksum23 += (uint32_t) v1x23; + out[93] = v1x23; + } + if (2 < k) { + const int8_t v2x23 = w2[23]; + ksum23 += (uint32_t) v2x23; + out[94] = v2x23; + } + if (3 < k) { + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v3x23; + out[95] = v3x23; + } + const int8_t v0x24 = w0[24]; + ksum24 += (uint32_t) v0x24; + out[96] = v0x24; + if (1 < k) { + const int8_t v1x24 = w1[24]; + ksum24 += (uint32_t) v1x24; + out[97] = v1x24; + } + if (2 < k) { + const int8_t v2x24 = w2[24]; + ksum24 += (uint32_t) v2x24; + out[98] = v2x24; + } + if (3 < k) { + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v3x24; + out[99] = v3x24; + } + const int8_t v0x25 = w0[25]; + ksum25 += (uint32_t) v0x25; + out[100] = v0x25; + if (1 < k) { + const int8_t v1x25 = w1[25]; + ksum25 += (uint32_t) v1x25; + out[101] = v1x25; + } + if (2 < k) { + const int8_t v2x25 = w2[25]; + ksum25 += (uint32_t) v2x25; + out[102] = v2x25; + } + if (3 < k) { + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v3x25; + out[103] = v3x25; + } + const int8_t v0x26 = w0[26]; + ksum26 += (uint32_t) v0x26; + out[104] = v0x26; + if (1 < k) { + const int8_t v1x26 = w1[26]; + ksum26 += (uint32_t) v1x26; + out[105] = v1x26; + } + if (2 < k) { + const int8_t v2x26 = w2[26]; + ksum26 += (uint32_t) v2x26; + out[106] = v2x26; + } + if (3 < k) { + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v3x26; + out[107] = v3x26; + } + const int8_t v0x27 = w0[27]; + ksum27 += (uint32_t) v0x27; + out[108] = v0x27; + if (1 < k) { + const int8_t v1x27 = w1[27]; + ksum27 += (uint32_t) v1x27; + out[109] = v1x27; + } + if (2 < k) { + const int8_t v2x27 = w2[27]; + ksum27 += (uint32_t) v2x27; + out[110] = v2x27; + } + if (3 < k) { + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v3x27; + out[111] = v3x27; + } + const int8_t v0x28 = w0[28]; + ksum28 += (uint32_t) v0x28; + out[112] = v0x28; + if (1 < k) { + const int8_t v1x28 = w1[28]; + ksum28 += (uint32_t) v1x28; + out[113] = v1x28; + } + if (2 < k) { + const int8_t v2x28 = w2[28]; + 
ksum28 += (uint32_t) v2x28; + out[114] = v2x28; + } + if (3 < k) { + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v3x28; + out[115] = v3x28; + } + const int8_t v0x29 = w0[29]; + ksum29 += (uint32_t) v0x29; + out[116] = v0x29; + if (1 < k) { + const int8_t v1x29 = w1[29]; + ksum29 += (uint32_t) v1x29; + out[117] = v1x29; + } + if (2 < k) { + const int8_t v2x29 = w2[29]; + ksum29 += (uint32_t) v2x29; + out[118] = v2x29; + } + if (3 < k) { + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v3x29; + out[119] = v3x29; + } + const int8_t v0x30 = w0[30]; + ksum30 += (uint32_t) v0x30; + out[120] = v0x30; + if (1 < k) { + const int8_t v1x30 = w1[30]; + ksum30 += (uint32_t) v1x30; + out[121] = v1x30; + } + if (2 < k) { + const int8_t v2x30 = w2[30]; + ksum30 += (uint32_t) v2x30; + out[122] = v2x30; + } + if (3 < k) { + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v3x30; + out[123] = v3x30; + } + const int8_t v0x31 = w0[31]; + ksum31 += (uint32_t) v0x31; + out[124] = v0x31; + if (1 < k) { + const int8_t v1x31 = w1[31]; + ksum31 += (uint32_t) v1x31; + out[125] = v1x31; + } + if (2 < k) { + const int8_t v2x31 = w2[31]; + ksum31 += (uint32_t) v2x31; + out[126] = v2x31; + } + if (3 < k) { + const int8_t v3x31 = w3[31]; + ksum31 += (uint32_t) v3x31; + out[127] = v3x31; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 128; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + packed_b[15] -= ksum15 * izp; + packed_b[16] -= ksum16 * izp; + packed_b[17] -= ksum17 * izp; + packed_b[18] -= ksum18 * izp; + 
packed_b[19] -= ksum19 * izp; + packed_b[20] -= ksum20 * izp; + packed_b[21] -= ksum21 * izp; + packed_b[22] -= ksum22 * izp; + packed_b[23] -= ksum23 * izp; + packed_b[24] -= ksum24 * izp; + packed_b[25] -= ksum25 * izp; + packed_b[26] -= ksum26 * izp; + packed_b[27] -= ksum27 * izp; + packed_b[28] -= ksum28 * izp; + packed_b[29] -= ksum29 * izp; + packed_b[30] -= ksum30 * izp; + packed_b[31] -= ksum31 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 32; + } + + // NC remainder (1..31) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (32 - n) * sizeof(int32_t); + + // NR remainder has less than 32 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + uint32_t ksum16 = 0; + uint32_t ksum17 = 0; + uint32_t ksum18 = 0; + uint32_t ksum19 = 0; + uint32_t ksum20 = 0; + uint32_t ksum21 = 0; + uint32_t ksum22 = 0; + uint32_t ksum23 = 0; + uint32_t ksum24 = 0; + uint32_t ksum25 = 0; + uint32_t ksum26 = 0; + uint32_t ksum27 = 0; + uint32_t ksum28 = 0; + uint32_t ksum29 = 0; + uint32_t ksum30 = 0; + + // KC main loop multiple of 32x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += 
(uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; + out[10] = v2x2; + out[11] = v3x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + 
out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + out[32] = v0x8; + out[33] = v1x8; + out[34] = v2x8; + out[35] = v3x8; + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + out[36] = v0x9; + out[37] = v1x9; + out[38] = v2x9; + out[39] = v3x9; + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + out[40] = v0x10; + out[41] = v1x10; + out[42] = v2x10; + out[43] = v3x10; + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + out[44] = v0x11; + out[45] = v1x11; + out[46] = v2x11; + out[47] = v3x11; + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + out[48] = v0x12; + out[49] = v1x12; + 
out[50] = v2x12; + out[51] = v3x12; + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + out[52] = v0x13; + out[53] = v1x13; + out[54] = v2x13; + out[55] = v3x13; + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + out[56] = v0x14; + out[57] = v1x14; + out[58] = v2x14; + out[59] = v3x14; + } + if (15 < n) { + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + out[60] = v0x15; + out[61] = v1x15; + out[62] = v2x15; + out[63] = v3x15; + } + if (16 < n) { + const int8_t v0x16 = w0[16]; + const int8_t v1x16 = w1[16]; + const int8_t v2x16 = w2[16]; + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v0x16; + ksum16 += (uint32_t) v1x16; + ksum16 += (uint32_t) v2x16; + ksum16 += (uint32_t) v3x16; + out[64] = v0x16; + out[65] = v1x16; + out[66] = v2x16; + out[67] = v3x16; + } + if (17 < n) { + const int8_t v0x17 = w0[17]; + const int8_t v1x17 = w1[17]; + const int8_t v2x17 = w2[17]; + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v0x17; + ksum17 += (uint32_t) v1x17; + ksum17 += (uint32_t) v2x17; + ksum17 += (uint32_t) v3x17; + out[68] = v0x17; + out[69] = v1x17; + out[70] = v2x17; + out[71] = v3x17; + } + if (18 < n) { + const int8_t v0x18 = w0[18]; + const int8_t v1x18 = w1[18]; + const int8_t v2x18 = w2[18]; + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v0x18; + ksum18 += (uint32_t) v1x18; + ksum18 += (uint32_t) v2x18; + ksum18 += (uint32_t) v3x18; 
+ out[72] = v0x18; + out[73] = v1x18; + out[74] = v2x18; + out[75] = v3x18; + } + if (19 < n) { + const int8_t v0x19 = w0[19]; + const int8_t v1x19 = w1[19]; + const int8_t v2x19 = w2[19]; + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v0x19; + ksum19 += (uint32_t) v1x19; + ksum19 += (uint32_t) v2x19; + ksum19 += (uint32_t) v3x19; + out[76] = v0x19; + out[77] = v1x19; + out[78] = v2x19; + out[79] = v3x19; + } + if (20 < n) { + const int8_t v0x20 = w0[20]; + const int8_t v1x20 = w1[20]; + const int8_t v2x20 = w2[20]; + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v0x20; + ksum20 += (uint32_t) v1x20; + ksum20 += (uint32_t) v2x20; + ksum20 += (uint32_t) v3x20; + out[80] = v0x20; + out[81] = v1x20; + out[82] = v2x20; + out[83] = v3x20; + } + if (21 < n) { + const int8_t v0x21 = w0[21]; + const int8_t v1x21 = w1[21]; + const int8_t v2x21 = w2[21]; + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v0x21; + ksum21 += (uint32_t) v1x21; + ksum21 += (uint32_t) v2x21; + ksum21 += (uint32_t) v3x21; + out[84] = v0x21; + out[85] = v1x21; + out[86] = v2x21; + out[87] = v3x21; + } + if (22 < n) { + const int8_t v0x22 = w0[22]; + const int8_t v1x22 = w1[22]; + const int8_t v2x22 = w2[22]; + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v0x22; + ksum22 += (uint32_t) v1x22; + ksum22 += (uint32_t) v2x22; + ksum22 += (uint32_t) v3x22; + out[88] = v0x22; + out[89] = v1x22; + out[90] = v2x22; + out[91] = v3x22; + } + if (23 < n) { + const int8_t v0x23 = w0[23]; + const int8_t v1x23 = w1[23]; + const int8_t v2x23 = w2[23]; + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v0x23; + ksum23 += (uint32_t) v1x23; + ksum23 += (uint32_t) v2x23; + ksum23 += (uint32_t) v3x23; + out[92] = v0x23; + out[93] = v1x23; + out[94] = v2x23; + out[95] = v3x23; + } + if (24 < n) { + const int8_t v0x24 = w0[24]; + const int8_t v1x24 = w1[24]; + const int8_t v2x24 = w2[24]; + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v0x24; + ksum24 += (uint32_t) v1x24; + ksum24 += 
(uint32_t) v2x24; + ksum24 += (uint32_t) v3x24; + out[96] = v0x24; + out[97] = v1x24; + out[98] = v2x24; + out[99] = v3x24; + } + if (25 < n) { + const int8_t v0x25 = w0[25]; + const int8_t v1x25 = w1[25]; + const int8_t v2x25 = w2[25]; + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v0x25; + ksum25 += (uint32_t) v1x25; + ksum25 += (uint32_t) v2x25; + ksum25 += (uint32_t) v3x25; + out[100] = v0x25; + out[101] = v1x25; + out[102] = v2x25; + out[103] = v3x25; + } + if (26 < n) { + const int8_t v0x26 = w0[26]; + const int8_t v1x26 = w1[26]; + const int8_t v2x26 = w2[26]; + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v0x26; + ksum26 += (uint32_t) v1x26; + ksum26 += (uint32_t) v2x26; + ksum26 += (uint32_t) v3x26; + out[104] = v0x26; + out[105] = v1x26; + out[106] = v2x26; + out[107] = v3x26; + } + if (27 < n) { + const int8_t v0x27 = w0[27]; + const int8_t v1x27 = w1[27]; + const int8_t v2x27 = w2[27]; + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v0x27; + ksum27 += (uint32_t) v1x27; + ksum27 += (uint32_t) v2x27; + ksum27 += (uint32_t) v3x27; + out[108] = v0x27; + out[109] = v1x27; + out[110] = v2x27; + out[111] = v3x27; + } + if (28 < n) { + const int8_t v0x28 = w0[28]; + const int8_t v1x28 = w1[28]; + const int8_t v2x28 = w2[28]; + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v0x28; + ksum28 += (uint32_t) v1x28; + ksum28 += (uint32_t) v2x28; + ksum28 += (uint32_t) v3x28; + out[112] = v0x28; + out[113] = v1x28; + out[114] = v2x28; + out[115] = v3x28; + } + if (29 < n) { + const int8_t v0x29 = w0[29]; + const int8_t v1x29 = w1[29]; + const int8_t v2x29 = w2[29]; + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v0x29; + ksum29 += (uint32_t) v1x29; + ksum29 += (uint32_t) v2x29; + ksum29 += (uint32_t) v3x29; + out[116] = v0x29; + out[117] = v1x29; + out[118] = v2x29; + out[119] = v3x29; + } + if (30 < n) { + const int8_t v0x30 = w0[30]; + const int8_t v1x30 = w1[30]; + const int8_t v2x30 = w2[30]; + const int8_t v3x30 = w3[30]; + ksum30 += 
(uint32_t) v0x30; + ksum30 += (uint32_t) v1x30; + ksum30 += (uint32_t) v2x30; + ksum30 += (uint32_t) v3x30; + out[120] = v0x30; + out[121] = v1x30; + out[122] = v2x30; + out[123] = v3x30; + } + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 128; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + 
ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[32] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[33] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[34] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[35] = v3x8; + } + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[36] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[37] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; 
+ out[38] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[39] = v3x9; + } + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[40] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[41] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[42] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[43] = v3x10; + } + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[44] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[45] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[46] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[47] = v3x11; + } + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[48] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[49] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[50] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[51] = v3x12; + } + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[52] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[53] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[54] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[55] = v3x13; + } + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[56] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[57] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[58] = v2x14; + 
} + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[59] = v3x14; + } + } + if (15 < n) { + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[60] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[61] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[62] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[63] = v3x15; + } + } + if (16 < n) { + const int8_t v0x16 = w0[16]; + ksum16 += (uint32_t) v0x16; + out[64] = v0x16; + if (1 < k) { + const int8_t v1x16 = w1[16]; + ksum16 += (uint32_t) v1x16; + out[65] = v1x16; + } + if (2 < k) { + const int8_t v2x16 = w2[16]; + ksum16 += (uint32_t) v2x16; + out[66] = v2x16; + } + if (3 < k) { + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v3x16; + out[67] = v3x16; + } + } + if (17 < n) { + const int8_t v0x17 = w0[17]; + ksum17 += (uint32_t) v0x17; + out[68] = v0x17; + if (1 < k) { + const int8_t v1x17 = w1[17]; + ksum17 += (uint32_t) v1x17; + out[69] = v1x17; + } + if (2 < k) { + const int8_t v2x17 = w2[17]; + ksum17 += (uint32_t) v2x17; + out[70] = v2x17; + } + if (3 < k) { + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v3x17; + out[71] = v3x17; + } + } + if (18 < n) { + const int8_t v0x18 = w0[18]; + ksum18 += (uint32_t) v0x18; + out[72] = v0x18; + if (1 < k) { + const int8_t v1x18 = w1[18]; + ksum18 += (uint32_t) v1x18; + out[73] = v1x18; + } + if (2 < k) { + const int8_t v2x18 = w2[18]; + ksum18 += (uint32_t) v2x18; + out[74] = v2x18; + } + if (3 < k) { + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v3x18; + out[75] = v3x18; + } + } + if (19 < n) { + const int8_t v0x19 = w0[19]; + ksum19 += (uint32_t) v0x19; + out[76] = v0x19; + if (1 < k) { + const int8_t v1x19 = w1[19]; + ksum19 += (uint32_t) v1x19; + out[77] = v1x19; + } + if (2 < k) { + const int8_t v2x19 = w2[19]; + ksum19 += (uint32_t) v2x19; + out[78] = v2x19; + } + if (3 < k) 
{ + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v3x19; + out[79] = v3x19; + } + } + if (20 < n) { + const int8_t v0x20 = w0[20]; + ksum20 += (uint32_t) v0x20; + out[80] = v0x20; + if (1 < k) { + const int8_t v1x20 = w1[20]; + ksum20 += (uint32_t) v1x20; + out[81] = v1x20; + } + if (2 < k) { + const int8_t v2x20 = w2[20]; + ksum20 += (uint32_t) v2x20; + out[82] = v2x20; + } + if (3 < k) { + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v3x20; + out[83] = v3x20; + } + } + if (21 < n) { + const int8_t v0x21 = w0[21]; + ksum21 += (uint32_t) v0x21; + out[84] = v0x21; + if (1 < k) { + const int8_t v1x21 = w1[21]; + ksum21 += (uint32_t) v1x21; + out[85] = v1x21; + } + if (2 < k) { + const int8_t v2x21 = w2[21]; + ksum21 += (uint32_t) v2x21; + out[86] = v2x21; + } + if (3 < k) { + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v3x21; + out[87] = v3x21; + } + } + if (22 < n) { + const int8_t v0x22 = w0[22]; + ksum22 += (uint32_t) v0x22; + out[88] = v0x22; + if (1 < k) { + const int8_t v1x22 = w1[22]; + ksum22 += (uint32_t) v1x22; + out[89] = v1x22; + } + if (2 < k) { + const int8_t v2x22 = w2[22]; + ksum22 += (uint32_t) v2x22; + out[90] = v2x22; + } + if (3 < k) { + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v3x22; + out[91] = v3x22; + } + } + if (23 < n) { + const int8_t v0x23 = w0[23]; + ksum23 += (uint32_t) v0x23; + out[92] = v0x23; + if (1 < k) { + const int8_t v1x23 = w1[23]; + ksum23 += (uint32_t) v1x23; + out[93] = v1x23; + } + if (2 < k) { + const int8_t v2x23 = w2[23]; + ksum23 += (uint32_t) v2x23; + out[94] = v2x23; + } + if (3 < k) { + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v3x23; + out[95] = v3x23; + } + } + if (24 < n) { + const int8_t v0x24 = w0[24]; + ksum24 += (uint32_t) v0x24; + out[96] = v0x24; + if (1 < k) { + const int8_t v1x24 = w1[24]; + ksum24 += (uint32_t) v1x24; + out[97] = v1x24; + } + if (2 < k) { + const int8_t v2x24 = w2[24]; + ksum24 += (uint32_t) v2x24; + out[98] = v2x24; + } + if (3 < k) { + const 
int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v3x24; + out[99] = v3x24; + } + } + if (25 < n) { + const int8_t v0x25 = w0[25]; + ksum25 += (uint32_t) v0x25; + out[100] = v0x25; + if (1 < k) { + const int8_t v1x25 = w1[25]; + ksum25 += (uint32_t) v1x25; + out[101] = v1x25; + } + if (2 < k) { + const int8_t v2x25 = w2[25]; + ksum25 += (uint32_t) v2x25; + out[102] = v2x25; + } + if (3 < k) { + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v3x25; + out[103] = v3x25; + } + } + if (26 < n) { + const int8_t v0x26 = w0[26]; + ksum26 += (uint32_t) v0x26; + out[104] = v0x26; + if (1 < k) { + const int8_t v1x26 = w1[26]; + ksum26 += (uint32_t) v1x26; + out[105] = v1x26; + } + if (2 < k) { + const int8_t v2x26 = w2[26]; + ksum26 += (uint32_t) v2x26; + out[106] = v2x26; + } + if (3 < k) { + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v3x26; + out[107] = v3x26; + } + } + if (27 < n) { + const int8_t v0x27 = w0[27]; + ksum27 += (uint32_t) v0x27; + out[108] = v0x27; + if (1 < k) { + const int8_t v1x27 = w1[27]; + ksum27 += (uint32_t) v1x27; + out[109] = v1x27; + } + if (2 < k) { + const int8_t v2x27 = w2[27]; + ksum27 += (uint32_t) v2x27; + out[110] = v2x27; + } + if (3 < k) { + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v3x27; + out[111] = v3x27; + } + } + if (28 < n) { + const int8_t v0x28 = w0[28]; + ksum28 += (uint32_t) v0x28; + out[112] = v0x28; + if (1 < k) { + const int8_t v1x28 = w1[28]; + ksum28 += (uint32_t) v1x28; + out[113] = v1x28; + } + if (2 < k) { + const int8_t v2x28 = w2[28]; + ksum28 += (uint32_t) v2x28; + out[114] = v2x28; + } + if (3 < k) { + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v3x28; + out[115] = v3x28; + } + } + if (29 < n) { + const int8_t v0x29 = w0[29]; + ksum29 += (uint32_t) v0x29; + out[116] = v0x29; + if (1 < k) { + const int8_t v1x29 = w1[29]; + ksum29 += (uint32_t) v1x29; + out[117] = v1x29; + } + if (2 < k) { + const int8_t v2x29 = w2[29]; + ksum29 += (uint32_t) v2x29; + out[118] = v2x29; + } + if (3 < k) { + 
const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v3x29; + out[119] = v3x29; + } + } + if (30 < n) { + const int8_t v0x30 = w0[30]; + ksum30 += (uint32_t) v0x30; + out[120] = v0x30; + if (1 < k) { + const int8_t v1x30 = w1[30]; + ksum30 += (uint32_t) v1x30; + out[121] = v1x30; + } + if (2 < k) { + const int8_t v2x30 = w2[30]; + ksum30 += (uint32_t) v2x30; + out[122] = v2x30; + } + if (3 < k) { + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v3x30; + out[123] = v3x30; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 128; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + packed_b[15] -= ksum15 * izp; + packed_b[16] -= ksum16 * izp; + packed_b[17] -= ksum17 * izp; + packed_b[18] -= ksum18 * izp; + packed_b[19] -= ksum19 * izp; + packed_b[20] -= ksum20 * izp; + packed_b[21] -= ksum21 * izp; + packed_b[22] -= ksum22 * izp; + packed_b[23] -= ksum23 * izp; + packed_b[24] -= ksum24 * izp; + packed_b[25] -= ksum25 * izp; + packed_b[26] -= ksum26 * izp; + packed_b[27] -= ksum27 * izp; + packed_b[28] -= ksum28 * izp; + packed_b[29] -= ksum29 * izp; + packed_b[30] -= ksum30 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/gen/qs8-packw-x64c4-gemm-gio-scalar.c b/src/qs8-packw/gen/qs8-packw-x64c4-gemm-gio-scalar.c new file mode 100644 index 00000000000..4412f91a795 --- /dev/null +++ b/src/qs8-packw/gen/qs8-packw-x64c4-gemm-gio-scalar.c @@ -0,0 +1,4575 @@ +// Auto-generated file. Do not edit! 
+// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_packw_gemm_gio_ukernel_x64c4__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 64); + assert(kr == 4); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? (((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + + do { + // NC main loop multiple of 64 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 64; n -= 64) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + ((int32_t*) out)[8] = b[8]; + ((int32_t*) out)[9] = b[9]; + ((int32_t*) out)[10] = b[10]; + ((int32_t*) out)[11] = b[11]; + ((int32_t*) out)[12] = b[12]; + ((int32_t*) out)[13] = b[13]; + ((int32_t*) out)[14] = b[14]; + ((int32_t*) out)[15] = b[15]; + ((int32_t*) out)[16] = b[16]; + ((int32_t*) out)[17] = b[17]; + ((int32_t*) out)[18] = b[18]; + ((int32_t*) out)[19] = b[19]; + ((int32_t*) out)[20] = b[20]; + ((int32_t*) out)[21] = b[21]; + ((int32_t*) out)[22] = b[22]; + ((int32_t*) out)[23] = b[23]; + ((int32_t*) out)[24] = b[24]; + ((int32_t*) out)[25] = b[25]; + 
((int32_t*) out)[26] = b[26]; + ((int32_t*) out)[27] = b[27]; + ((int32_t*) out)[28] = b[28]; + ((int32_t*) out)[29] = b[29]; + ((int32_t*) out)[30] = b[30]; + ((int32_t*) out)[31] = b[31]; + ((int32_t*) out)[32] = b[32]; + ((int32_t*) out)[33] = b[33]; + ((int32_t*) out)[34] = b[34]; + ((int32_t*) out)[35] = b[35]; + ((int32_t*) out)[36] = b[36]; + ((int32_t*) out)[37] = b[37]; + ((int32_t*) out)[38] = b[38]; + ((int32_t*) out)[39] = b[39]; + ((int32_t*) out)[40] = b[40]; + ((int32_t*) out)[41] = b[41]; + ((int32_t*) out)[42] = b[42]; + ((int32_t*) out)[43] = b[43]; + ((int32_t*) out)[44] = b[44]; + ((int32_t*) out)[45] = b[45]; + ((int32_t*) out)[46] = b[46]; + ((int32_t*) out)[47] = b[47]; + ((int32_t*) out)[48] = b[48]; + ((int32_t*) out)[49] = b[49]; + ((int32_t*) out)[50] = b[50]; + ((int32_t*) out)[51] = b[51]; + ((int32_t*) out)[52] = b[52]; + ((int32_t*) out)[53] = b[53]; + ((int32_t*) out)[54] = b[54]; + ((int32_t*) out)[55] = b[55]; + ((int32_t*) out)[56] = b[56]; + ((int32_t*) out)[57] = b[57]; + ((int32_t*) out)[58] = b[58]; + ((int32_t*) out)[59] = b[59]; + ((int32_t*) out)[60] = b[60]; + ((int32_t*) out)[61] = b[61]; + ((int32_t*) out)[62] = b[62]; + ((int32_t*) out)[63] = b[63]; + b += 64; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + ((int32_t*) out)[8] = 0; + ((int32_t*) out)[9] = 0; + ((int32_t*) out)[10] = 0; + ((int32_t*) out)[11] = 0; + ((int32_t*) out)[12] = 0; + ((int32_t*) out)[13] = 0; + ((int32_t*) out)[14] = 0; + ((int32_t*) out)[15] = 0; + ((int32_t*) out)[16] = 0; + ((int32_t*) out)[17] = 0; + ((int32_t*) out)[18] = 0; + ((int32_t*) out)[19] = 0; + ((int32_t*) out)[20] = 0; + ((int32_t*) out)[21] = 0; + ((int32_t*) out)[22] = 0; + ((int32_t*) out)[23] = 0; + ((int32_t*) out)[24] = 0; + ((int32_t*) out)[25] = 0; + ((int32_t*) out)[26] = 0; + ((int32_t*) 
out)[27] = 0; + ((int32_t*) out)[28] = 0; + ((int32_t*) out)[29] = 0; + ((int32_t*) out)[30] = 0; + ((int32_t*) out)[31] = 0; + ((int32_t*) out)[32] = 0; + ((int32_t*) out)[33] = 0; + ((int32_t*) out)[34] = 0; + ((int32_t*) out)[35] = 0; + ((int32_t*) out)[36] = 0; + ((int32_t*) out)[37] = 0; + ((int32_t*) out)[38] = 0; + ((int32_t*) out)[39] = 0; + ((int32_t*) out)[40] = 0; + ((int32_t*) out)[41] = 0; + ((int32_t*) out)[42] = 0; + ((int32_t*) out)[43] = 0; + ((int32_t*) out)[44] = 0; + ((int32_t*) out)[45] = 0; + ((int32_t*) out)[46] = 0; + ((int32_t*) out)[47] = 0; + ((int32_t*) out)[48] = 0; + ((int32_t*) out)[49] = 0; + ((int32_t*) out)[50] = 0; + ((int32_t*) out)[51] = 0; + ((int32_t*) out)[52] = 0; + ((int32_t*) out)[53] = 0; + ((int32_t*) out)[54] = 0; + ((int32_t*) out)[55] = 0; + ((int32_t*) out)[56] = 0; + ((int32_t*) out)[57] = 0; + ((int32_t*) out)[58] = 0; + ((int32_t*) out)[59] = 0; + ((int32_t*) out)[60] = 0; + ((int32_t*) out)[61] = 0; + ((int32_t*) out)[62] = 0; + ((int32_t*) out)[63] = 0; + } + out += 64 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + uint32_t ksum16 = 0; + uint32_t ksum17 = 0; + uint32_t ksum18 = 0; + uint32_t ksum19 = 0; + uint32_t ksum20 = 0; + uint32_t ksum21 = 0; + uint32_t ksum22 = 0; + uint32_t ksum23 = 0; + uint32_t ksum24 = 0; + uint32_t ksum25 = 0; + uint32_t ksum26 = 0; + uint32_t ksum27 = 0; + uint32_t ksum28 = 0; + uint32_t ksum29 = 0; + uint32_t ksum30 = 0; + uint32_t ksum31 = 0; + uint32_t ksum32 = 0; + uint32_t ksum33 = 0; + uint32_t ksum34 = 0; + uint32_t ksum35 = 0; + uint32_t 
ksum36 = 0; + uint32_t ksum37 = 0; + uint32_t ksum38 = 0; + uint32_t ksum39 = 0; + uint32_t ksum40 = 0; + uint32_t ksum41 = 0; + uint32_t ksum42 = 0; + uint32_t ksum43 = 0; + uint32_t ksum44 = 0; + uint32_t ksum45 = 0; + uint32_t ksum46 = 0; + uint32_t ksum47 = 0; + uint32_t ksum48 = 0; + uint32_t ksum49 = 0; + uint32_t ksum50 = 0; + uint32_t ksum51 = 0; + uint32_t ksum52 = 0; + uint32_t ksum53 = 0; + uint32_t ksum54 = 0; + uint32_t ksum55 = 0; + uint32_t ksum56 = 0; + uint32_t ksum57 = 0; + uint32_t ksum58 = 0; + uint32_t ksum59 = 0; + uint32_t ksum60 = 0; + uint32_t ksum61 = 0; + uint32_t ksum62 = 0; + uint32_t ksum63 = 0; + + // KC main loop multiple of 64x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; + out[10] = v2x2; + out[11] = v3x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const 
int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + out[32] = v0x8; + out[33] = v1x8; + out[34] = v2x8; + out[35] = v3x8; + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + out[36] = v0x9; + out[37] = v1x9; + out[38] = v2x9; + out[39] = v3x9; + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + out[40] = v0x10; + out[41] = v1x10; + out[42] = 
v2x10; + out[43] = v3x10; + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + out[44] = v0x11; + out[45] = v1x11; + out[46] = v2x11; + out[47] = v3x11; + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + out[48] = v0x12; + out[49] = v1x12; + out[50] = v2x12; + out[51] = v3x12; + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + out[52] = v0x13; + out[53] = v1x13; + out[54] = v2x13; + out[55] = v3x13; + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + out[56] = v0x14; + out[57] = v1x14; + out[58] = v2x14; + out[59] = v3x14; + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + out[60] = v0x15; + out[61] = v1x15; + out[62] = v2x15; + out[63] = v3x15; + const int8_t v0x16 = w0[16]; + const int8_t v1x16 = w1[16]; + const int8_t v2x16 = w2[16]; + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v0x16; + ksum16 += (uint32_t) v1x16; + ksum16 += (uint32_t) v2x16; + ksum16 += (uint32_t) v3x16; + out[64] = v0x16; + out[65] = v1x16; + out[66] = v2x16; + out[67] = v3x16; + const int8_t v0x17 = w0[17]; + const int8_t v1x17 = 
w1[17]; + const int8_t v2x17 = w2[17]; + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v0x17; + ksum17 += (uint32_t) v1x17; + ksum17 += (uint32_t) v2x17; + ksum17 += (uint32_t) v3x17; + out[68] = v0x17; + out[69] = v1x17; + out[70] = v2x17; + out[71] = v3x17; + const int8_t v0x18 = w0[18]; + const int8_t v1x18 = w1[18]; + const int8_t v2x18 = w2[18]; + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v0x18; + ksum18 += (uint32_t) v1x18; + ksum18 += (uint32_t) v2x18; + ksum18 += (uint32_t) v3x18; + out[72] = v0x18; + out[73] = v1x18; + out[74] = v2x18; + out[75] = v3x18; + const int8_t v0x19 = w0[19]; + const int8_t v1x19 = w1[19]; + const int8_t v2x19 = w2[19]; + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v0x19; + ksum19 += (uint32_t) v1x19; + ksum19 += (uint32_t) v2x19; + ksum19 += (uint32_t) v3x19; + out[76] = v0x19; + out[77] = v1x19; + out[78] = v2x19; + out[79] = v3x19; + const int8_t v0x20 = w0[20]; + const int8_t v1x20 = w1[20]; + const int8_t v2x20 = w2[20]; + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v0x20; + ksum20 += (uint32_t) v1x20; + ksum20 += (uint32_t) v2x20; + ksum20 += (uint32_t) v3x20; + out[80] = v0x20; + out[81] = v1x20; + out[82] = v2x20; + out[83] = v3x20; + const int8_t v0x21 = w0[21]; + const int8_t v1x21 = w1[21]; + const int8_t v2x21 = w2[21]; + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v0x21; + ksum21 += (uint32_t) v1x21; + ksum21 += (uint32_t) v2x21; + ksum21 += (uint32_t) v3x21; + out[84] = v0x21; + out[85] = v1x21; + out[86] = v2x21; + out[87] = v3x21; + const int8_t v0x22 = w0[22]; + const int8_t v1x22 = w1[22]; + const int8_t v2x22 = w2[22]; + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v0x22; + ksum22 += (uint32_t) v1x22; + ksum22 += (uint32_t) v2x22; + ksum22 += (uint32_t) v3x22; + out[88] = v0x22; + out[89] = v1x22; + out[90] = v2x22; + out[91] = v3x22; + const int8_t v0x23 = w0[23]; + const int8_t v1x23 = w1[23]; + const int8_t v2x23 = w2[23]; + const int8_t v3x23 = w3[23]; + ksum23 
+= (uint32_t) v0x23; + ksum23 += (uint32_t) v1x23; + ksum23 += (uint32_t) v2x23; + ksum23 += (uint32_t) v3x23; + out[92] = v0x23; + out[93] = v1x23; + out[94] = v2x23; + out[95] = v3x23; + const int8_t v0x24 = w0[24]; + const int8_t v1x24 = w1[24]; + const int8_t v2x24 = w2[24]; + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v0x24; + ksum24 += (uint32_t) v1x24; + ksum24 += (uint32_t) v2x24; + ksum24 += (uint32_t) v3x24; + out[96] = v0x24; + out[97] = v1x24; + out[98] = v2x24; + out[99] = v3x24; + const int8_t v0x25 = w0[25]; + const int8_t v1x25 = w1[25]; + const int8_t v2x25 = w2[25]; + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v0x25; + ksum25 += (uint32_t) v1x25; + ksum25 += (uint32_t) v2x25; + ksum25 += (uint32_t) v3x25; + out[100] = v0x25; + out[101] = v1x25; + out[102] = v2x25; + out[103] = v3x25; + const int8_t v0x26 = w0[26]; + const int8_t v1x26 = w1[26]; + const int8_t v2x26 = w2[26]; + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v0x26; + ksum26 += (uint32_t) v1x26; + ksum26 += (uint32_t) v2x26; + ksum26 += (uint32_t) v3x26; + out[104] = v0x26; + out[105] = v1x26; + out[106] = v2x26; + out[107] = v3x26; + const int8_t v0x27 = w0[27]; + const int8_t v1x27 = w1[27]; + const int8_t v2x27 = w2[27]; + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v0x27; + ksum27 += (uint32_t) v1x27; + ksum27 += (uint32_t) v2x27; + ksum27 += (uint32_t) v3x27; + out[108] = v0x27; + out[109] = v1x27; + out[110] = v2x27; + out[111] = v3x27; + const int8_t v0x28 = w0[28]; + const int8_t v1x28 = w1[28]; + const int8_t v2x28 = w2[28]; + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v0x28; + ksum28 += (uint32_t) v1x28; + ksum28 += (uint32_t) v2x28; + ksum28 += (uint32_t) v3x28; + out[112] = v0x28; + out[113] = v1x28; + out[114] = v2x28; + out[115] = v3x28; + const int8_t v0x29 = w0[29]; + const int8_t v1x29 = w1[29]; + const int8_t v2x29 = w2[29]; + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v0x29; + ksum29 += (uint32_t) v1x29; + ksum29 += 
(uint32_t) v2x29; + ksum29 += (uint32_t) v3x29; + out[116] = v0x29; + out[117] = v1x29; + out[118] = v2x29; + out[119] = v3x29; + const int8_t v0x30 = w0[30]; + const int8_t v1x30 = w1[30]; + const int8_t v2x30 = w2[30]; + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v0x30; + ksum30 += (uint32_t) v1x30; + ksum30 += (uint32_t) v2x30; + ksum30 += (uint32_t) v3x30; + out[120] = v0x30; + out[121] = v1x30; + out[122] = v2x30; + out[123] = v3x30; + const int8_t v0x31 = w0[31]; + const int8_t v1x31 = w1[31]; + const int8_t v2x31 = w2[31]; + const int8_t v3x31 = w3[31]; + ksum31 += (uint32_t) v0x31; + ksum31 += (uint32_t) v1x31; + ksum31 += (uint32_t) v2x31; + ksum31 += (uint32_t) v3x31; + out[124] = v0x31; + out[125] = v1x31; + out[126] = v2x31; + out[127] = v3x31; + const int8_t v0x32 = w0[32]; + const int8_t v1x32 = w1[32]; + const int8_t v2x32 = w2[32]; + const int8_t v3x32 = w3[32]; + ksum32 += (uint32_t) v0x32; + ksum32 += (uint32_t) v1x32; + ksum32 += (uint32_t) v2x32; + ksum32 += (uint32_t) v3x32; + out[128] = v0x32; + out[129] = v1x32; + out[130] = v2x32; + out[131] = v3x32; + const int8_t v0x33 = w0[33]; + const int8_t v1x33 = w1[33]; + const int8_t v2x33 = w2[33]; + const int8_t v3x33 = w3[33]; + ksum33 += (uint32_t) v0x33; + ksum33 += (uint32_t) v1x33; + ksum33 += (uint32_t) v2x33; + ksum33 += (uint32_t) v3x33; + out[132] = v0x33; + out[133] = v1x33; + out[134] = v2x33; + out[135] = v3x33; + const int8_t v0x34 = w0[34]; + const int8_t v1x34 = w1[34]; + const int8_t v2x34 = w2[34]; + const int8_t v3x34 = w3[34]; + ksum34 += (uint32_t) v0x34; + ksum34 += (uint32_t) v1x34; + ksum34 += (uint32_t) v2x34; + ksum34 += (uint32_t) v3x34; + out[136] = v0x34; + out[137] = v1x34; + out[138] = v2x34; + out[139] = v3x34; + const int8_t v0x35 = w0[35]; + const int8_t v1x35 = w1[35]; + const int8_t v2x35 = w2[35]; + const int8_t v3x35 = w3[35]; + ksum35 += (uint32_t) v0x35; + ksum35 += (uint32_t) v1x35; + ksum35 += (uint32_t) v2x35; + ksum35 += (uint32_t) v3x35; + 
out[140] = v0x35; + out[141] = v1x35; + out[142] = v2x35; + out[143] = v3x35; + const int8_t v0x36 = w0[36]; + const int8_t v1x36 = w1[36]; + const int8_t v2x36 = w2[36]; + const int8_t v3x36 = w3[36]; + ksum36 += (uint32_t) v0x36; + ksum36 += (uint32_t) v1x36; + ksum36 += (uint32_t) v2x36; + ksum36 += (uint32_t) v3x36; + out[144] = v0x36; + out[145] = v1x36; + out[146] = v2x36; + out[147] = v3x36; + const int8_t v0x37 = w0[37]; + const int8_t v1x37 = w1[37]; + const int8_t v2x37 = w2[37]; + const int8_t v3x37 = w3[37]; + ksum37 += (uint32_t) v0x37; + ksum37 += (uint32_t) v1x37; + ksum37 += (uint32_t) v2x37; + ksum37 += (uint32_t) v3x37; + out[148] = v0x37; + out[149] = v1x37; + out[150] = v2x37; + out[151] = v3x37; + const int8_t v0x38 = w0[38]; + const int8_t v1x38 = w1[38]; + const int8_t v2x38 = w2[38]; + const int8_t v3x38 = w3[38]; + ksum38 += (uint32_t) v0x38; + ksum38 += (uint32_t) v1x38; + ksum38 += (uint32_t) v2x38; + ksum38 += (uint32_t) v3x38; + out[152] = v0x38; + out[153] = v1x38; + out[154] = v2x38; + out[155] = v3x38; + const int8_t v0x39 = w0[39]; + const int8_t v1x39 = w1[39]; + const int8_t v2x39 = w2[39]; + const int8_t v3x39 = w3[39]; + ksum39 += (uint32_t) v0x39; + ksum39 += (uint32_t) v1x39; + ksum39 += (uint32_t) v2x39; + ksum39 += (uint32_t) v3x39; + out[156] = v0x39; + out[157] = v1x39; + out[158] = v2x39; + out[159] = v3x39; + const int8_t v0x40 = w0[40]; + const int8_t v1x40 = w1[40]; + const int8_t v2x40 = w2[40]; + const int8_t v3x40 = w3[40]; + ksum40 += (uint32_t) v0x40; + ksum40 += (uint32_t) v1x40; + ksum40 += (uint32_t) v2x40; + ksum40 += (uint32_t) v3x40; + out[160] = v0x40; + out[161] = v1x40; + out[162] = v2x40; + out[163] = v3x40; + const int8_t v0x41 = w0[41]; + const int8_t v1x41 = w1[41]; + const int8_t v2x41 = w2[41]; + const int8_t v3x41 = w3[41]; + ksum41 += (uint32_t) v0x41; + ksum41 += (uint32_t) v1x41; + ksum41 += (uint32_t) v2x41; + ksum41 += (uint32_t) v3x41; + out[164] = v0x41; + out[165] = v1x41; + out[166] = 
v2x41; + out[167] = v3x41; + const int8_t v0x42 = w0[42]; + const int8_t v1x42 = w1[42]; + const int8_t v2x42 = w2[42]; + const int8_t v3x42 = w3[42]; + ksum42 += (uint32_t) v0x42; + ksum42 += (uint32_t) v1x42; + ksum42 += (uint32_t) v2x42; + ksum42 += (uint32_t) v3x42; + out[168] = v0x42; + out[169] = v1x42; + out[170] = v2x42; + out[171] = v3x42; + const int8_t v0x43 = w0[43]; + const int8_t v1x43 = w1[43]; + const int8_t v2x43 = w2[43]; + const int8_t v3x43 = w3[43]; + ksum43 += (uint32_t) v0x43; + ksum43 += (uint32_t) v1x43; + ksum43 += (uint32_t) v2x43; + ksum43 += (uint32_t) v3x43; + out[172] = v0x43; + out[173] = v1x43; + out[174] = v2x43; + out[175] = v3x43; + const int8_t v0x44 = w0[44]; + const int8_t v1x44 = w1[44]; + const int8_t v2x44 = w2[44]; + const int8_t v3x44 = w3[44]; + ksum44 += (uint32_t) v0x44; + ksum44 += (uint32_t) v1x44; + ksum44 += (uint32_t) v2x44; + ksum44 += (uint32_t) v3x44; + out[176] = v0x44; + out[177] = v1x44; + out[178] = v2x44; + out[179] = v3x44; + const int8_t v0x45 = w0[45]; + const int8_t v1x45 = w1[45]; + const int8_t v2x45 = w2[45]; + const int8_t v3x45 = w3[45]; + ksum45 += (uint32_t) v0x45; + ksum45 += (uint32_t) v1x45; + ksum45 += (uint32_t) v2x45; + ksum45 += (uint32_t) v3x45; + out[180] = v0x45; + out[181] = v1x45; + out[182] = v2x45; + out[183] = v3x45; + const int8_t v0x46 = w0[46]; + const int8_t v1x46 = w1[46]; + const int8_t v2x46 = w2[46]; + const int8_t v3x46 = w3[46]; + ksum46 += (uint32_t) v0x46; + ksum46 += (uint32_t) v1x46; + ksum46 += (uint32_t) v2x46; + ksum46 += (uint32_t) v3x46; + out[184] = v0x46; + out[185] = v1x46; + out[186] = v2x46; + out[187] = v3x46; + const int8_t v0x47 = w0[47]; + const int8_t v1x47 = w1[47]; + const int8_t v2x47 = w2[47]; + const int8_t v3x47 = w3[47]; + ksum47 += (uint32_t) v0x47; + ksum47 += (uint32_t) v1x47; + ksum47 += (uint32_t) v2x47; + ksum47 += (uint32_t) v3x47; + out[188] = v0x47; + out[189] = v1x47; + out[190] = v2x47; + out[191] = v3x47; + const int8_t v0x48 = 
w0[48]; + const int8_t v1x48 = w1[48]; + const int8_t v2x48 = w2[48]; + const int8_t v3x48 = w3[48]; + ksum48 += (uint32_t) v0x48; + ksum48 += (uint32_t) v1x48; + ksum48 += (uint32_t) v2x48; + ksum48 += (uint32_t) v3x48; + out[192] = v0x48; + out[193] = v1x48; + out[194] = v2x48; + out[195] = v3x48; + const int8_t v0x49 = w0[49]; + const int8_t v1x49 = w1[49]; + const int8_t v2x49 = w2[49]; + const int8_t v3x49 = w3[49]; + ksum49 += (uint32_t) v0x49; + ksum49 += (uint32_t) v1x49; + ksum49 += (uint32_t) v2x49; + ksum49 += (uint32_t) v3x49; + out[196] = v0x49; + out[197] = v1x49; + out[198] = v2x49; + out[199] = v3x49; + const int8_t v0x50 = w0[50]; + const int8_t v1x50 = w1[50]; + const int8_t v2x50 = w2[50]; + const int8_t v3x50 = w3[50]; + ksum50 += (uint32_t) v0x50; + ksum50 += (uint32_t) v1x50; + ksum50 += (uint32_t) v2x50; + ksum50 += (uint32_t) v3x50; + out[200] = v0x50; + out[201] = v1x50; + out[202] = v2x50; + out[203] = v3x50; + const int8_t v0x51 = w0[51]; + const int8_t v1x51 = w1[51]; + const int8_t v2x51 = w2[51]; + const int8_t v3x51 = w3[51]; + ksum51 += (uint32_t) v0x51; + ksum51 += (uint32_t) v1x51; + ksum51 += (uint32_t) v2x51; + ksum51 += (uint32_t) v3x51; + out[204] = v0x51; + out[205] = v1x51; + out[206] = v2x51; + out[207] = v3x51; + const int8_t v0x52 = w0[52]; + const int8_t v1x52 = w1[52]; + const int8_t v2x52 = w2[52]; + const int8_t v3x52 = w3[52]; + ksum52 += (uint32_t) v0x52; + ksum52 += (uint32_t) v1x52; + ksum52 += (uint32_t) v2x52; + ksum52 += (uint32_t) v3x52; + out[208] = v0x52; + out[209] = v1x52; + out[210] = v2x52; + out[211] = v3x52; + const int8_t v0x53 = w0[53]; + const int8_t v1x53 = w1[53]; + const int8_t v2x53 = w2[53]; + const int8_t v3x53 = w3[53]; + ksum53 += (uint32_t) v0x53; + ksum53 += (uint32_t) v1x53; + ksum53 += (uint32_t) v2x53; + ksum53 += (uint32_t) v3x53; + out[212] = v0x53; + out[213] = v1x53; + out[214] = v2x53; + out[215] = v3x53; + const int8_t v0x54 = w0[54]; + const int8_t v1x54 = w1[54]; + const int8_t 
v2x54 = w2[54]; + const int8_t v3x54 = w3[54]; + ksum54 += (uint32_t) v0x54; + ksum54 += (uint32_t) v1x54; + ksum54 += (uint32_t) v2x54; + ksum54 += (uint32_t) v3x54; + out[216] = v0x54; + out[217] = v1x54; + out[218] = v2x54; + out[219] = v3x54; + const int8_t v0x55 = w0[55]; + const int8_t v1x55 = w1[55]; + const int8_t v2x55 = w2[55]; + const int8_t v3x55 = w3[55]; + ksum55 += (uint32_t) v0x55; + ksum55 += (uint32_t) v1x55; + ksum55 += (uint32_t) v2x55; + ksum55 += (uint32_t) v3x55; + out[220] = v0x55; + out[221] = v1x55; + out[222] = v2x55; + out[223] = v3x55; + const int8_t v0x56 = w0[56]; + const int8_t v1x56 = w1[56]; + const int8_t v2x56 = w2[56]; + const int8_t v3x56 = w3[56]; + ksum56 += (uint32_t) v0x56; + ksum56 += (uint32_t) v1x56; + ksum56 += (uint32_t) v2x56; + ksum56 += (uint32_t) v3x56; + out[224] = v0x56; + out[225] = v1x56; + out[226] = v2x56; + out[227] = v3x56; + const int8_t v0x57 = w0[57]; + const int8_t v1x57 = w1[57]; + const int8_t v2x57 = w2[57]; + const int8_t v3x57 = w3[57]; + ksum57 += (uint32_t) v0x57; + ksum57 += (uint32_t) v1x57; + ksum57 += (uint32_t) v2x57; + ksum57 += (uint32_t) v3x57; + out[228] = v0x57; + out[229] = v1x57; + out[230] = v2x57; + out[231] = v3x57; + const int8_t v0x58 = w0[58]; + const int8_t v1x58 = w1[58]; + const int8_t v2x58 = w2[58]; + const int8_t v3x58 = w3[58]; + ksum58 += (uint32_t) v0x58; + ksum58 += (uint32_t) v1x58; + ksum58 += (uint32_t) v2x58; + ksum58 += (uint32_t) v3x58; + out[232] = v0x58; + out[233] = v1x58; + out[234] = v2x58; + out[235] = v3x58; + const int8_t v0x59 = w0[59]; + const int8_t v1x59 = w1[59]; + const int8_t v2x59 = w2[59]; + const int8_t v3x59 = w3[59]; + ksum59 += (uint32_t) v0x59; + ksum59 += (uint32_t) v1x59; + ksum59 += (uint32_t) v2x59; + ksum59 += (uint32_t) v3x59; + out[236] = v0x59; + out[237] = v1x59; + out[238] = v2x59; + out[239] = v3x59; + const int8_t v0x60 = w0[60]; + const int8_t v1x60 = w1[60]; + const int8_t v2x60 = w2[60]; + const int8_t v3x60 = w3[60]; + ksum60 
+= (uint32_t) v0x60; + ksum60 += (uint32_t) v1x60; + ksum60 += (uint32_t) v2x60; + ksum60 += (uint32_t) v3x60; + out[240] = v0x60; + out[241] = v1x60; + out[242] = v2x60; + out[243] = v3x60; + const int8_t v0x61 = w0[61]; + const int8_t v1x61 = w1[61]; + const int8_t v2x61 = w2[61]; + const int8_t v3x61 = w3[61]; + ksum61 += (uint32_t) v0x61; + ksum61 += (uint32_t) v1x61; + ksum61 += (uint32_t) v2x61; + ksum61 += (uint32_t) v3x61; + out[244] = v0x61; + out[245] = v1x61; + out[246] = v2x61; + out[247] = v3x61; + const int8_t v0x62 = w0[62]; + const int8_t v1x62 = w1[62]; + const int8_t v2x62 = w2[62]; + const int8_t v3x62 = w3[62]; + ksum62 += (uint32_t) v0x62; + ksum62 += (uint32_t) v1x62; + ksum62 += (uint32_t) v2x62; + ksum62 += (uint32_t) v3x62; + out[248] = v0x62; + out[249] = v1x62; + out[250] = v2x62; + out[251] = v3x62; + const int8_t v0x63 = w0[63]; + const int8_t v1x63 = w1[63]; + const int8_t v2x63 = w2[63]; + const int8_t v3x63 = w3[63]; + ksum63 += (uint32_t) v0x63; + ksum63 += (uint32_t) v1x63; + ksum63 += (uint32_t) v2x63; + ksum63 += (uint32_t) v3x63; + out[252] = v0x63; + out[253] = v1x63; + out[254] = v2x63; + out[255] = v3x63; + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 256; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + 
ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } 
+ if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[32] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[33] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[34] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[35] = v3x8; + } + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[36] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[37] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[38] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[39] = v3x9; + } + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[40] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[41] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[42] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[43] = v3x10; + } + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[44] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[45] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[46] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[47] = v3x11; + } + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[48] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[49] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[50] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += 
(uint32_t) v3x12; + out[51] = v3x12; + } + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[52] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[53] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[54] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[55] = v3x13; + } + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[56] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[57] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[58] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[59] = v3x14; + } + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[60] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[61] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[62] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[63] = v3x15; + } + const int8_t v0x16 = w0[16]; + ksum16 += (uint32_t) v0x16; + out[64] = v0x16; + if (1 < k) { + const int8_t v1x16 = w1[16]; + ksum16 += (uint32_t) v1x16; + out[65] = v1x16; + } + if (2 < k) { + const int8_t v2x16 = w2[16]; + ksum16 += (uint32_t) v2x16; + out[66] = v2x16; + } + if (3 < k) { + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v3x16; + out[67] = v3x16; + } + const int8_t v0x17 = w0[17]; + ksum17 += (uint32_t) v0x17; + out[68] = v0x17; + if (1 < k) { + const int8_t v1x17 = w1[17]; + ksum17 += (uint32_t) v1x17; + out[69] = v1x17; + } + if (2 < k) { + const int8_t v2x17 = w2[17]; + ksum17 += (uint32_t) v2x17; + out[70] = v2x17; + } + if (3 < k) { + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v3x17; + out[71] = v3x17; + } + const int8_t v0x18 = w0[18]; + ksum18 += (uint32_t) v0x18; + out[72] = 
v0x18; + if (1 < k) { + const int8_t v1x18 = w1[18]; + ksum18 += (uint32_t) v1x18; + out[73] = v1x18; + } + if (2 < k) { + const int8_t v2x18 = w2[18]; + ksum18 += (uint32_t) v2x18; + out[74] = v2x18; + } + if (3 < k) { + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v3x18; + out[75] = v3x18; + } + const int8_t v0x19 = w0[19]; + ksum19 += (uint32_t) v0x19; + out[76] = v0x19; + if (1 < k) { + const int8_t v1x19 = w1[19]; + ksum19 += (uint32_t) v1x19; + out[77] = v1x19; + } + if (2 < k) { + const int8_t v2x19 = w2[19]; + ksum19 += (uint32_t) v2x19; + out[78] = v2x19; + } + if (3 < k) { + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v3x19; + out[79] = v3x19; + } + const int8_t v0x20 = w0[20]; + ksum20 += (uint32_t) v0x20; + out[80] = v0x20; + if (1 < k) { + const int8_t v1x20 = w1[20]; + ksum20 += (uint32_t) v1x20; + out[81] = v1x20; + } + if (2 < k) { + const int8_t v2x20 = w2[20]; + ksum20 += (uint32_t) v2x20; + out[82] = v2x20; + } + if (3 < k) { + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v3x20; + out[83] = v3x20; + } + const int8_t v0x21 = w0[21]; + ksum21 += (uint32_t) v0x21; + out[84] = v0x21; + if (1 < k) { + const int8_t v1x21 = w1[21]; + ksum21 += (uint32_t) v1x21; + out[85] = v1x21; + } + if (2 < k) { + const int8_t v2x21 = w2[21]; + ksum21 += (uint32_t) v2x21; + out[86] = v2x21; + } + if (3 < k) { + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v3x21; + out[87] = v3x21; + } + const int8_t v0x22 = w0[22]; + ksum22 += (uint32_t) v0x22; + out[88] = v0x22; + if (1 < k) { + const int8_t v1x22 = w1[22]; + ksum22 += (uint32_t) v1x22; + out[89] = v1x22; + } + if (2 < k) { + const int8_t v2x22 = w2[22]; + ksum22 += (uint32_t) v2x22; + out[90] = v2x22; + } + if (3 < k) { + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v3x22; + out[91] = v3x22; + } + const int8_t v0x23 = w0[23]; + ksum23 += (uint32_t) v0x23; + out[92] = v0x23; + if (1 < k) { + const int8_t v1x23 = w1[23]; + ksum23 += (uint32_t) v1x23; + out[93] = v1x23; + } + if (2 
< k) { + const int8_t v2x23 = w2[23]; + ksum23 += (uint32_t) v2x23; + out[94] = v2x23; + } + if (3 < k) { + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v3x23; + out[95] = v3x23; + } + const int8_t v0x24 = w0[24]; + ksum24 += (uint32_t) v0x24; + out[96] = v0x24; + if (1 < k) { + const int8_t v1x24 = w1[24]; + ksum24 += (uint32_t) v1x24; + out[97] = v1x24; + } + if (2 < k) { + const int8_t v2x24 = w2[24]; + ksum24 += (uint32_t) v2x24; + out[98] = v2x24; + } + if (3 < k) { + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v3x24; + out[99] = v3x24; + } + const int8_t v0x25 = w0[25]; + ksum25 += (uint32_t) v0x25; + out[100] = v0x25; + if (1 < k) { + const int8_t v1x25 = w1[25]; + ksum25 += (uint32_t) v1x25; + out[101] = v1x25; + } + if (2 < k) { + const int8_t v2x25 = w2[25]; + ksum25 += (uint32_t) v2x25; + out[102] = v2x25; + } + if (3 < k) { + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v3x25; + out[103] = v3x25; + } + const int8_t v0x26 = w0[26]; + ksum26 += (uint32_t) v0x26; + out[104] = v0x26; + if (1 < k) { + const int8_t v1x26 = w1[26]; + ksum26 += (uint32_t) v1x26; + out[105] = v1x26; + } + if (2 < k) { + const int8_t v2x26 = w2[26]; + ksum26 += (uint32_t) v2x26; + out[106] = v2x26; + } + if (3 < k) { + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v3x26; + out[107] = v3x26; + } + const int8_t v0x27 = w0[27]; + ksum27 += (uint32_t) v0x27; + out[108] = v0x27; + if (1 < k) { + const int8_t v1x27 = w1[27]; + ksum27 += (uint32_t) v1x27; + out[109] = v1x27; + } + if (2 < k) { + const int8_t v2x27 = w2[27]; + ksum27 += (uint32_t) v2x27; + out[110] = v2x27; + } + if (3 < k) { + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v3x27; + out[111] = v3x27; + } + const int8_t v0x28 = w0[28]; + ksum28 += (uint32_t) v0x28; + out[112] = v0x28; + if (1 < k) { + const int8_t v1x28 = w1[28]; + ksum28 += (uint32_t) v1x28; + out[113] = v1x28; + } + if (2 < k) { + const int8_t v2x28 = w2[28]; + ksum28 += (uint32_t) v2x28; + out[114] = v2x28; + } + if (3 
< k) { + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v3x28; + out[115] = v3x28; + } + const int8_t v0x29 = w0[29]; + ksum29 += (uint32_t) v0x29; + out[116] = v0x29; + if (1 < k) { + const int8_t v1x29 = w1[29]; + ksum29 += (uint32_t) v1x29; + out[117] = v1x29; + } + if (2 < k) { + const int8_t v2x29 = w2[29]; + ksum29 += (uint32_t) v2x29; + out[118] = v2x29; + } + if (3 < k) { + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v3x29; + out[119] = v3x29; + } + const int8_t v0x30 = w0[30]; + ksum30 += (uint32_t) v0x30; + out[120] = v0x30; + if (1 < k) { + const int8_t v1x30 = w1[30]; + ksum30 += (uint32_t) v1x30; + out[121] = v1x30; + } + if (2 < k) { + const int8_t v2x30 = w2[30]; + ksum30 += (uint32_t) v2x30; + out[122] = v2x30; + } + if (3 < k) { + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v3x30; + out[123] = v3x30; + } + const int8_t v0x31 = w0[31]; + ksum31 += (uint32_t) v0x31; + out[124] = v0x31; + if (1 < k) { + const int8_t v1x31 = w1[31]; + ksum31 += (uint32_t) v1x31; + out[125] = v1x31; + } + if (2 < k) { + const int8_t v2x31 = w2[31]; + ksum31 += (uint32_t) v2x31; + out[126] = v2x31; + } + if (3 < k) { + const int8_t v3x31 = w3[31]; + ksum31 += (uint32_t) v3x31; + out[127] = v3x31; + } + const int8_t v0x32 = w0[32]; + ksum32 += (uint32_t) v0x32; + out[128] = v0x32; + if (1 < k) { + const int8_t v1x32 = w1[32]; + ksum32 += (uint32_t) v1x32; + out[129] = v1x32; + } + if (2 < k) { + const int8_t v2x32 = w2[32]; + ksum32 += (uint32_t) v2x32; + out[130] = v2x32; + } + if (3 < k) { + const int8_t v3x32 = w3[32]; + ksum32 += (uint32_t) v3x32; + out[131] = v3x32; + } + const int8_t v0x33 = w0[33]; + ksum33 += (uint32_t) v0x33; + out[132] = v0x33; + if (1 < k) { + const int8_t v1x33 = w1[33]; + ksum33 += (uint32_t) v1x33; + out[133] = v1x33; + } + if (2 < k) { + const int8_t v2x33 = w2[33]; + ksum33 += (uint32_t) v2x33; + out[134] = v2x33; + } + if (3 < k) { + const int8_t v3x33 = w3[33]; + ksum33 += (uint32_t) v3x33; + out[135] = v3x33; + } + 
const int8_t v0x34 = w0[34]; + ksum34 += (uint32_t) v0x34; + out[136] = v0x34; + if (1 < k) { + const int8_t v1x34 = w1[34]; + ksum34 += (uint32_t) v1x34; + out[137] = v1x34; + } + if (2 < k) { + const int8_t v2x34 = w2[34]; + ksum34 += (uint32_t) v2x34; + out[138] = v2x34; + } + if (3 < k) { + const int8_t v3x34 = w3[34]; + ksum34 += (uint32_t) v3x34; + out[139] = v3x34; + } + const int8_t v0x35 = w0[35]; + ksum35 += (uint32_t) v0x35; + out[140] = v0x35; + if (1 < k) { + const int8_t v1x35 = w1[35]; + ksum35 += (uint32_t) v1x35; + out[141] = v1x35; + } + if (2 < k) { + const int8_t v2x35 = w2[35]; + ksum35 += (uint32_t) v2x35; + out[142] = v2x35; + } + if (3 < k) { + const int8_t v3x35 = w3[35]; + ksum35 += (uint32_t) v3x35; + out[143] = v3x35; + } + const int8_t v0x36 = w0[36]; + ksum36 += (uint32_t) v0x36; + out[144] = v0x36; + if (1 < k) { + const int8_t v1x36 = w1[36]; + ksum36 += (uint32_t) v1x36; + out[145] = v1x36; + } + if (2 < k) { + const int8_t v2x36 = w2[36]; + ksum36 += (uint32_t) v2x36; + out[146] = v2x36; + } + if (3 < k) { + const int8_t v3x36 = w3[36]; + ksum36 += (uint32_t) v3x36; + out[147] = v3x36; + } + const int8_t v0x37 = w0[37]; + ksum37 += (uint32_t) v0x37; + out[148] = v0x37; + if (1 < k) { + const int8_t v1x37 = w1[37]; + ksum37 += (uint32_t) v1x37; + out[149] = v1x37; + } + if (2 < k) { + const int8_t v2x37 = w2[37]; + ksum37 += (uint32_t) v2x37; + out[150] = v2x37; + } + if (3 < k) { + const int8_t v3x37 = w3[37]; + ksum37 += (uint32_t) v3x37; + out[151] = v3x37; + } + const int8_t v0x38 = w0[38]; + ksum38 += (uint32_t) v0x38; + out[152] = v0x38; + if (1 < k) { + const int8_t v1x38 = w1[38]; + ksum38 += (uint32_t) v1x38; + out[153] = v1x38; + } + if (2 < k) { + const int8_t v2x38 = w2[38]; + ksum38 += (uint32_t) v2x38; + out[154] = v2x38; + } + if (3 < k) { + const int8_t v3x38 = w3[38]; + ksum38 += (uint32_t) v3x38; + out[155] = v3x38; + } + const int8_t v0x39 = w0[39]; + ksum39 += (uint32_t) v0x39; + out[156] = v0x39; + if (1 < k) { 
+ const int8_t v1x39 = w1[39]; + ksum39 += (uint32_t) v1x39; + out[157] = v1x39; + } + if (2 < k) { + const int8_t v2x39 = w2[39]; + ksum39 += (uint32_t) v2x39; + out[158] = v2x39; + } + if (3 < k) { + const int8_t v3x39 = w3[39]; + ksum39 += (uint32_t) v3x39; + out[159] = v3x39; + } + const int8_t v0x40 = w0[40]; + ksum40 += (uint32_t) v0x40; + out[160] = v0x40; + if (1 < k) { + const int8_t v1x40 = w1[40]; + ksum40 += (uint32_t) v1x40; + out[161] = v1x40; + } + if (2 < k) { + const int8_t v2x40 = w2[40]; + ksum40 += (uint32_t) v2x40; + out[162] = v2x40; + } + if (3 < k) { + const int8_t v3x40 = w3[40]; + ksum40 += (uint32_t) v3x40; + out[163] = v3x40; + } + const int8_t v0x41 = w0[41]; + ksum41 += (uint32_t) v0x41; + out[164] = v0x41; + if (1 < k) { + const int8_t v1x41 = w1[41]; + ksum41 += (uint32_t) v1x41; + out[165] = v1x41; + } + if (2 < k) { + const int8_t v2x41 = w2[41]; + ksum41 += (uint32_t) v2x41; + out[166] = v2x41; + } + if (3 < k) { + const int8_t v3x41 = w3[41]; + ksum41 += (uint32_t) v3x41; + out[167] = v3x41; + } + const int8_t v0x42 = w0[42]; + ksum42 += (uint32_t) v0x42; + out[168] = v0x42; + if (1 < k) { + const int8_t v1x42 = w1[42]; + ksum42 += (uint32_t) v1x42; + out[169] = v1x42; + } + if (2 < k) { + const int8_t v2x42 = w2[42]; + ksum42 += (uint32_t) v2x42; + out[170] = v2x42; + } + if (3 < k) { + const int8_t v3x42 = w3[42]; + ksum42 += (uint32_t) v3x42; + out[171] = v3x42; + } + const int8_t v0x43 = w0[43]; + ksum43 += (uint32_t) v0x43; + out[172] = v0x43; + if (1 < k) { + const int8_t v1x43 = w1[43]; + ksum43 += (uint32_t) v1x43; + out[173] = v1x43; + } + if (2 < k) { + const int8_t v2x43 = w2[43]; + ksum43 += (uint32_t) v2x43; + out[174] = v2x43; + } + if (3 < k) { + const int8_t v3x43 = w3[43]; + ksum43 += (uint32_t) v3x43; + out[175] = v3x43; + } + const int8_t v0x44 = w0[44]; + ksum44 += (uint32_t) v0x44; + out[176] = v0x44; + if (1 < k) { + const int8_t v1x44 = w1[44]; + ksum44 += (uint32_t) v1x44; + out[177] = v1x44; + } + if (2 < 
k) { + const int8_t v2x44 = w2[44]; + ksum44 += (uint32_t) v2x44; + out[178] = v2x44; + } + if (3 < k) { + const int8_t v3x44 = w3[44]; + ksum44 += (uint32_t) v3x44; + out[179] = v3x44; + } + const int8_t v0x45 = w0[45]; + ksum45 += (uint32_t) v0x45; + out[180] = v0x45; + if (1 < k) { + const int8_t v1x45 = w1[45]; + ksum45 += (uint32_t) v1x45; + out[181] = v1x45; + } + if (2 < k) { + const int8_t v2x45 = w2[45]; + ksum45 += (uint32_t) v2x45; + out[182] = v2x45; + } + if (3 < k) { + const int8_t v3x45 = w3[45]; + ksum45 += (uint32_t) v3x45; + out[183] = v3x45; + } + const int8_t v0x46 = w0[46]; + ksum46 += (uint32_t) v0x46; + out[184] = v0x46; + if (1 < k) { + const int8_t v1x46 = w1[46]; + ksum46 += (uint32_t) v1x46; + out[185] = v1x46; + } + if (2 < k) { + const int8_t v2x46 = w2[46]; + ksum46 += (uint32_t) v2x46; + out[186] = v2x46; + } + if (3 < k) { + const int8_t v3x46 = w3[46]; + ksum46 += (uint32_t) v3x46; + out[187] = v3x46; + } + const int8_t v0x47 = w0[47]; + ksum47 += (uint32_t) v0x47; + out[188] = v0x47; + if (1 < k) { + const int8_t v1x47 = w1[47]; + ksum47 += (uint32_t) v1x47; + out[189] = v1x47; + } + if (2 < k) { + const int8_t v2x47 = w2[47]; + ksum47 += (uint32_t) v2x47; + out[190] = v2x47; + } + if (3 < k) { + const int8_t v3x47 = w3[47]; + ksum47 += (uint32_t) v3x47; + out[191] = v3x47; + } + const int8_t v0x48 = w0[48]; + ksum48 += (uint32_t) v0x48; + out[192] = v0x48; + if (1 < k) { + const int8_t v1x48 = w1[48]; + ksum48 += (uint32_t) v1x48; + out[193] = v1x48; + } + if (2 < k) { + const int8_t v2x48 = w2[48]; + ksum48 += (uint32_t) v2x48; + out[194] = v2x48; + } + if (3 < k) { + const int8_t v3x48 = w3[48]; + ksum48 += (uint32_t) v3x48; + out[195] = v3x48; + } + const int8_t v0x49 = w0[49]; + ksum49 += (uint32_t) v0x49; + out[196] = v0x49; + if (1 < k) { + const int8_t v1x49 = w1[49]; + ksum49 += (uint32_t) v1x49; + out[197] = v1x49; + } + if (2 < k) { + const int8_t v2x49 = w2[49]; + ksum49 += (uint32_t) v2x49; + out[198] = v2x49; + } + if 
(3 < k) { + const int8_t v3x49 = w3[49]; + ksum49 += (uint32_t) v3x49; + out[199] = v3x49; + } + const int8_t v0x50 = w0[50]; + ksum50 += (uint32_t) v0x50; + out[200] = v0x50; + if (1 < k) { + const int8_t v1x50 = w1[50]; + ksum50 += (uint32_t) v1x50; + out[201] = v1x50; + } + if (2 < k) { + const int8_t v2x50 = w2[50]; + ksum50 += (uint32_t) v2x50; + out[202] = v2x50; + } + if (3 < k) { + const int8_t v3x50 = w3[50]; + ksum50 += (uint32_t) v3x50; + out[203] = v3x50; + } + const int8_t v0x51 = w0[51]; + ksum51 += (uint32_t) v0x51; + out[204] = v0x51; + if (1 < k) { + const int8_t v1x51 = w1[51]; + ksum51 += (uint32_t) v1x51; + out[205] = v1x51; + } + if (2 < k) { + const int8_t v2x51 = w2[51]; + ksum51 += (uint32_t) v2x51; + out[206] = v2x51; + } + if (3 < k) { + const int8_t v3x51 = w3[51]; + ksum51 += (uint32_t) v3x51; + out[207] = v3x51; + } + const int8_t v0x52 = w0[52]; + ksum52 += (uint32_t) v0x52; + out[208] = v0x52; + if (1 < k) { + const int8_t v1x52 = w1[52]; + ksum52 += (uint32_t) v1x52; + out[209] = v1x52; + } + if (2 < k) { + const int8_t v2x52 = w2[52]; + ksum52 += (uint32_t) v2x52; + out[210] = v2x52; + } + if (3 < k) { + const int8_t v3x52 = w3[52]; + ksum52 += (uint32_t) v3x52; + out[211] = v3x52; + } + const int8_t v0x53 = w0[53]; + ksum53 += (uint32_t) v0x53; + out[212] = v0x53; + if (1 < k) { + const int8_t v1x53 = w1[53]; + ksum53 += (uint32_t) v1x53; + out[213] = v1x53; + } + if (2 < k) { + const int8_t v2x53 = w2[53]; + ksum53 += (uint32_t) v2x53; + out[214] = v2x53; + } + if (3 < k) { + const int8_t v3x53 = w3[53]; + ksum53 += (uint32_t) v3x53; + out[215] = v3x53; + } + const int8_t v0x54 = w0[54]; + ksum54 += (uint32_t) v0x54; + out[216] = v0x54; + if (1 < k) { + const int8_t v1x54 = w1[54]; + ksum54 += (uint32_t) v1x54; + out[217] = v1x54; + } + if (2 < k) { + const int8_t v2x54 = w2[54]; + ksum54 += (uint32_t) v2x54; + out[218] = v2x54; + } + if (3 < k) { + const int8_t v3x54 = w3[54]; + ksum54 += (uint32_t) v3x54; + out[219] = v3x54; + } 
+ const int8_t v0x55 = w0[55]; + ksum55 += (uint32_t) v0x55; + out[220] = v0x55; + if (1 < k) { + const int8_t v1x55 = w1[55]; + ksum55 += (uint32_t) v1x55; + out[221] = v1x55; + } + if (2 < k) { + const int8_t v2x55 = w2[55]; + ksum55 += (uint32_t) v2x55; + out[222] = v2x55; + } + if (3 < k) { + const int8_t v3x55 = w3[55]; + ksum55 += (uint32_t) v3x55; + out[223] = v3x55; + } + const int8_t v0x56 = w0[56]; + ksum56 += (uint32_t) v0x56; + out[224] = v0x56; + if (1 < k) { + const int8_t v1x56 = w1[56]; + ksum56 += (uint32_t) v1x56; + out[225] = v1x56; + } + if (2 < k) { + const int8_t v2x56 = w2[56]; + ksum56 += (uint32_t) v2x56; + out[226] = v2x56; + } + if (3 < k) { + const int8_t v3x56 = w3[56]; + ksum56 += (uint32_t) v3x56; + out[227] = v3x56; + } + const int8_t v0x57 = w0[57]; + ksum57 += (uint32_t) v0x57; + out[228] = v0x57; + if (1 < k) { + const int8_t v1x57 = w1[57]; + ksum57 += (uint32_t) v1x57; + out[229] = v1x57; + } + if (2 < k) { + const int8_t v2x57 = w2[57]; + ksum57 += (uint32_t) v2x57; + out[230] = v2x57; + } + if (3 < k) { + const int8_t v3x57 = w3[57]; + ksum57 += (uint32_t) v3x57; + out[231] = v3x57; + } + const int8_t v0x58 = w0[58]; + ksum58 += (uint32_t) v0x58; + out[232] = v0x58; + if (1 < k) { + const int8_t v1x58 = w1[58]; + ksum58 += (uint32_t) v1x58; + out[233] = v1x58; + } + if (2 < k) { + const int8_t v2x58 = w2[58]; + ksum58 += (uint32_t) v2x58; + out[234] = v2x58; + } + if (3 < k) { + const int8_t v3x58 = w3[58]; + ksum58 += (uint32_t) v3x58; + out[235] = v3x58; + } + const int8_t v0x59 = w0[59]; + ksum59 += (uint32_t) v0x59; + out[236] = v0x59; + if (1 < k) { + const int8_t v1x59 = w1[59]; + ksum59 += (uint32_t) v1x59; + out[237] = v1x59; + } + if (2 < k) { + const int8_t v2x59 = w2[59]; + ksum59 += (uint32_t) v2x59; + out[238] = v2x59; + } + if (3 < k) { + const int8_t v3x59 = w3[59]; + ksum59 += (uint32_t) v3x59; + out[239] = v3x59; + } + const int8_t v0x60 = w0[60]; + ksum60 += (uint32_t) v0x60; + out[240] = v0x60; + if (1 < k) 
{ + const int8_t v1x60 = w1[60]; + ksum60 += (uint32_t) v1x60; + out[241] = v1x60; + } + if (2 < k) { + const int8_t v2x60 = w2[60]; + ksum60 += (uint32_t) v2x60; + out[242] = v2x60; + } + if (3 < k) { + const int8_t v3x60 = w3[60]; + ksum60 += (uint32_t) v3x60; + out[243] = v3x60; + } + const int8_t v0x61 = w0[61]; + ksum61 += (uint32_t) v0x61; + out[244] = v0x61; + if (1 < k) { + const int8_t v1x61 = w1[61]; + ksum61 += (uint32_t) v1x61; + out[245] = v1x61; + } + if (2 < k) { + const int8_t v2x61 = w2[61]; + ksum61 += (uint32_t) v2x61; + out[246] = v2x61; + } + if (3 < k) { + const int8_t v3x61 = w3[61]; + ksum61 += (uint32_t) v3x61; + out[247] = v3x61; + } + const int8_t v0x62 = w0[62]; + ksum62 += (uint32_t) v0x62; + out[248] = v0x62; + if (1 < k) { + const int8_t v1x62 = w1[62]; + ksum62 += (uint32_t) v1x62; + out[249] = v1x62; + } + if (2 < k) { + const int8_t v2x62 = w2[62]; + ksum62 += (uint32_t) v2x62; + out[250] = v2x62; + } + if (3 < k) { + const int8_t v3x62 = w3[62]; + ksum62 += (uint32_t) v3x62; + out[251] = v3x62; + } + const int8_t v0x63 = w0[63]; + ksum63 += (uint32_t) v0x63; + out[252] = v0x63; + if (1 < k) { + const int8_t v1x63 = w1[63]; + ksum63 += (uint32_t) v1x63; + out[253] = v1x63; + } + if (2 < k) { + const int8_t v2x63 = w2[63]; + ksum63 += (uint32_t) v2x63; + out[254] = v2x63; + } + if (3 < k) { + const int8_t v3x63 = w3[63]; + ksum63 += (uint32_t) v3x63; + out[255] = v3x63; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 256; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + 
packed_b[15] -= ksum15 * izp; + packed_b[16] -= ksum16 * izp; + packed_b[17] -= ksum17 * izp; + packed_b[18] -= ksum18 * izp; + packed_b[19] -= ksum19 * izp; + packed_b[20] -= ksum20 * izp; + packed_b[21] -= ksum21 * izp; + packed_b[22] -= ksum22 * izp; + packed_b[23] -= ksum23 * izp; + packed_b[24] -= ksum24 * izp; + packed_b[25] -= ksum25 * izp; + packed_b[26] -= ksum26 * izp; + packed_b[27] -= ksum27 * izp; + packed_b[28] -= ksum28 * izp; + packed_b[29] -= ksum29 * izp; + packed_b[30] -= ksum30 * izp; + packed_b[31] -= ksum31 * izp; + packed_b[32] -= ksum32 * izp; + packed_b[33] -= ksum33 * izp; + packed_b[34] -= ksum34 * izp; + packed_b[35] -= ksum35 * izp; + packed_b[36] -= ksum36 * izp; + packed_b[37] -= ksum37 * izp; + packed_b[38] -= ksum38 * izp; + packed_b[39] -= ksum39 * izp; + packed_b[40] -= ksum40 * izp; + packed_b[41] -= ksum41 * izp; + packed_b[42] -= ksum42 * izp; + packed_b[43] -= ksum43 * izp; + packed_b[44] -= ksum44 * izp; + packed_b[45] -= ksum45 * izp; + packed_b[46] -= ksum46 * izp; + packed_b[47] -= ksum47 * izp; + packed_b[48] -= ksum48 * izp; + packed_b[49] -= ksum49 * izp; + packed_b[50] -= ksum50 * izp; + packed_b[51] -= ksum51 * izp; + packed_b[52] -= ksum52 * izp; + packed_b[53] -= ksum53 * izp; + packed_b[54] -= ksum54 * izp; + packed_b[55] -= ksum55 * izp; + packed_b[56] -= ksum56 * izp; + packed_b[57] -= ksum57 * izp; + packed_b[58] -= ksum58 * izp; + packed_b[59] -= ksum59 * izp; + packed_b[60] -= ksum60 * izp; + packed_b[61] -= ksum61 * izp; + packed_b[62] -= ksum62 * izp; + packed_b[63] -= ksum63 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 64; + } + + // NC remainder (1..63) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += 
(64 - n) * sizeof(int32_t); + + // NR remainder has less than 64 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + uint32_t ksum16 = 0; + uint32_t ksum17 = 0; + uint32_t ksum18 = 0; + uint32_t ksum19 = 0; + uint32_t ksum20 = 0; + uint32_t ksum21 = 0; + uint32_t ksum22 = 0; + uint32_t ksum23 = 0; + uint32_t ksum24 = 0; + uint32_t ksum25 = 0; + uint32_t ksum26 = 0; + uint32_t ksum27 = 0; + uint32_t ksum28 = 0; + uint32_t ksum29 = 0; + uint32_t ksum30 = 0; + uint32_t ksum31 = 0; + uint32_t ksum32 = 0; + uint32_t ksum33 = 0; + uint32_t ksum34 = 0; + uint32_t ksum35 = 0; + uint32_t ksum36 = 0; + uint32_t ksum37 = 0; + uint32_t ksum38 = 0; + uint32_t ksum39 = 0; + uint32_t ksum40 = 0; + uint32_t ksum41 = 0; + uint32_t ksum42 = 0; + uint32_t ksum43 = 0; + uint32_t ksum44 = 0; + uint32_t ksum45 = 0; + uint32_t ksum46 = 0; + uint32_t ksum47 = 0; + uint32_t ksum48 = 0; + uint32_t ksum49 = 0; + uint32_t ksum50 = 0; + uint32_t ksum51 = 0; + uint32_t ksum52 = 0; + uint32_t ksum53 = 0; + uint32_t ksum54 = 0; + uint32_t ksum55 = 0; + uint32_t ksum56 = 0; + uint32_t ksum57 = 0; + uint32_t ksum58 = 0; + uint32_t ksum59 = 0; + uint32_t ksum60 = 0; + uint32_t ksum61 = 0; + uint32_t ksum62 = 0; + + // KC main loop multiple of 64x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = 
v2x0; + out[3] = v3x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; + out[10] = v2x2; + out[11] = v3x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = 
w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + out[32] = v0x8; + out[33] = v1x8; + out[34] = v2x8; + out[35] = v3x8; + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + out[36] = v0x9; + out[37] = v1x9; + out[38] = v2x9; + out[39] = v3x9; + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + out[40] = v0x10; + out[41] = v1x10; + out[42] = v2x10; + out[43] = v3x10; + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + out[44] = v0x11; + out[45] = v1x11; + out[46] = v2x11; + out[47] = v3x11; + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + out[48] = v0x12; + out[49] = v1x12; + out[50] = v2x12; + out[51] = v3x12; + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const 
int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + out[52] = v0x13; + out[53] = v1x13; + out[54] = v2x13; + out[55] = v3x13; + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + out[56] = v0x14; + out[57] = v1x14; + out[58] = v2x14; + out[59] = v3x14; + } + if (15 < n) { + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + out[60] = v0x15; + out[61] = v1x15; + out[62] = v2x15; + out[63] = v3x15; + } + if (16 < n) { + const int8_t v0x16 = w0[16]; + const int8_t v1x16 = w1[16]; + const int8_t v2x16 = w2[16]; + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v0x16; + ksum16 += (uint32_t) v1x16; + ksum16 += (uint32_t) v2x16; + ksum16 += (uint32_t) v3x16; + out[64] = v0x16; + out[65] = v1x16; + out[66] = v2x16; + out[67] = v3x16; + } + if (17 < n) { + const int8_t v0x17 = w0[17]; + const int8_t v1x17 = w1[17]; + const int8_t v2x17 = w2[17]; + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v0x17; + ksum17 += (uint32_t) v1x17; + ksum17 += (uint32_t) v2x17; + ksum17 += (uint32_t) v3x17; + out[68] = v0x17; + out[69] = v1x17; + out[70] = v2x17; + out[71] = v3x17; + } + if (18 < n) { + const int8_t v0x18 = w0[18]; + const int8_t v1x18 = w1[18]; + const int8_t v2x18 = w2[18]; + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v0x18; + ksum18 += (uint32_t) v1x18; + ksum18 += (uint32_t) v2x18; + ksum18 += (uint32_t) v3x18; + out[72] = v0x18; + out[73] = v1x18; + out[74] = v2x18; + out[75] = v3x18; + } + if (19 < n) { + const int8_t v0x19 = 
w0[19]; + const int8_t v1x19 = w1[19]; + const int8_t v2x19 = w2[19]; + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v0x19; + ksum19 += (uint32_t) v1x19; + ksum19 += (uint32_t) v2x19; + ksum19 += (uint32_t) v3x19; + out[76] = v0x19; + out[77] = v1x19; + out[78] = v2x19; + out[79] = v3x19; + } + if (20 < n) { + const int8_t v0x20 = w0[20]; + const int8_t v1x20 = w1[20]; + const int8_t v2x20 = w2[20]; + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v0x20; + ksum20 += (uint32_t) v1x20; + ksum20 += (uint32_t) v2x20; + ksum20 += (uint32_t) v3x20; + out[80] = v0x20; + out[81] = v1x20; + out[82] = v2x20; + out[83] = v3x20; + } + if (21 < n) { + const int8_t v0x21 = w0[21]; + const int8_t v1x21 = w1[21]; + const int8_t v2x21 = w2[21]; + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v0x21; + ksum21 += (uint32_t) v1x21; + ksum21 += (uint32_t) v2x21; + ksum21 += (uint32_t) v3x21; + out[84] = v0x21; + out[85] = v1x21; + out[86] = v2x21; + out[87] = v3x21; + } + if (22 < n) { + const int8_t v0x22 = w0[22]; + const int8_t v1x22 = w1[22]; + const int8_t v2x22 = w2[22]; + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v0x22; + ksum22 += (uint32_t) v1x22; + ksum22 += (uint32_t) v2x22; + ksum22 += (uint32_t) v3x22; + out[88] = v0x22; + out[89] = v1x22; + out[90] = v2x22; + out[91] = v3x22; + } + if (23 < n) { + const int8_t v0x23 = w0[23]; + const int8_t v1x23 = w1[23]; + const int8_t v2x23 = w2[23]; + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v0x23; + ksum23 += (uint32_t) v1x23; + ksum23 += (uint32_t) v2x23; + ksum23 += (uint32_t) v3x23; + out[92] = v0x23; + out[93] = v1x23; + out[94] = v2x23; + out[95] = v3x23; + } + if (24 < n) { + const int8_t v0x24 = w0[24]; + const int8_t v1x24 = w1[24]; + const int8_t v2x24 = w2[24]; + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v0x24; + ksum24 += (uint32_t) v1x24; + ksum24 += (uint32_t) v2x24; + ksum24 += (uint32_t) v3x24; + out[96] = v0x24; + out[97] = v1x24; + out[98] = v2x24; + out[99] = v3x24; + 
} + if (25 < n) { + const int8_t v0x25 = w0[25]; + const int8_t v1x25 = w1[25]; + const int8_t v2x25 = w2[25]; + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v0x25; + ksum25 += (uint32_t) v1x25; + ksum25 += (uint32_t) v2x25; + ksum25 += (uint32_t) v3x25; + out[100] = v0x25; + out[101] = v1x25; + out[102] = v2x25; + out[103] = v3x25; + } + if (26 < n) { + const int8_t v0x26 = w0[26]; + const int8_t v1x26 = w1[26]; + const int8_t v2x26 = w2[26]; + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v0x26; + ksum26 += (uint32_t) v1x26; + ksum26 += (uint32_t) v2x26; + ksum26 += (uint32_t) v3x26; + out[104] = v0x26; + out[105] = v1x26; + out[106] = v2x26; + out[107] = v3x26; + } + if (27 < n) { + const int8_t v0x27 = w0[27]; + const int8_t v1x27 = w1[27]; + const int8_t v2x27 = w2[27]; + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v0x27; + ksum27 += (uint32_t) v1x27; + ksum27 += (uint32_t) v2x27; + ksum27 += (uint32_t) v3x27; + out[108] = v0x27; + out[109] = v1x27; + out[110] = v2x27; + out[111] = v3x27; + } + if (28 < n) { + const int8_t v0x28 = w0[28]; + const int8_t v1x28 = w1[28]; + const int8_t v2x28 = w2[28]; + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v0x28; + ksum28 += (uint32_t) v1x28; + ksum28 += (uint32_t) v2x28; + ksum28 += (uint32_t) v3x28; + out[112] = v0x28; + out[113] = v1x28; + out[114] = v2x28; + out[115] = v3x28; + } + if (29 < n) { + const int8_t v0x29 = w0[29]; + const int8_t v1x29 = w1[29]; + const int8_t v2x29 = w2[29]; + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v0x29; + ksum29 += (uint32_t) v1x29; + ksum29 += (uint32_t) v2x29; + ksum29 += (uint32_t) v3x29; + out[116] = v0x29; + out[117] = v1x29; + out[118] = v2x29; + out[119] = v3x29; + } + if (30 < n) { + const int8_t v0x30 = w0[30]; + const int8_t v1x30 = w1[30]; + const int8_t v2x30 = w2[30]; + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v0x30; + ksum30 += (uint32_t) v1x30; + ksum30 += (uint32_t) v2x30; + ksum30 += (uint32_t) v3x30; + out[120] = 
v0x30; + out[121] = v1x30; + out[122] = v2x30; + out[123] = v3x30; + } + if (31 < n) { + const int8_t v0x31 = w0[31]; + const int8_t v1x31 = w1[31]; + const int8_t v2x31 = w2[31]; + const int8_t v3x31 = w3[31]; + ksum31 += (uint32_t) v0x31; + ksum31 += (uint32_t) v1x31; + ksum31 += (uint32_t) v2x31; + ksum31 += (uint32_t) v3x31; + out[124] = v0x31; + out[125] = v1x31; + out[126] = v2x31; + out[127] = v3x31; + } + if (32 < n) { + const int8_t v0x32 = w0[32]; + const int8_t v1x32 = w1[32]; + const int8_t v2x32 = w2[32]; + const int8_t v3x32 = w3[32]; + ksum32 += (uint32_t) v0x32; + ksum32 += (uint32_t) v1x32; + ksum32 += (uint32_t) v2x32; + ksum32 += (uint32_t) v3x32; + out[128] = v0x32; + out[129] = v1x32; + out[130] = v2x32; + out[131] = v3x32; + } + if (33 < n) { + const int8_t v0x33 = w0[33]; + const int8_t v1x33 = w1[33]; + const int8_t v2x33 = w2[33]; + const int8_t v3x33 = w3[33]; + ksum33 += (uint32_t) v0x33; + ksum33 += (uint32_t) v1x33; + ksum33 += (uint32_t) v2x33; + ksum33 += (uint32_t) v3x33; + out[132] = v0x33; + out[133] = v1x33; + out[134] = v2x33; + out[135] = v3x33; + } + if (34 < n) { + const int8_t v0x34 = w0[34]; + const int8_t v1x34 = w1[34]; + const int8_t v2x34 = w2[34]; + const int8_t v3x34 = w3[34]; + ksum34 += (uint32_t) v0x34; + ksum34 += (uint32_t) v1x34; + ksum34 += (uint32_t) v2x34; + ksum34 += (uint32_t) v3x34; + out[136] = v0x34; + out[137] = v1x34; + out[138] = v2x34; + out[139] = v3x34; + } + if (35 < n) { + const int8_t v0x35 = w0[35]; + const int8_t v1x35 = w1[35]; + const int8_t v2x35 = w2[35]; + const int8_t v3x35 = w3[35]; + ksum35 += (uint32_t) v0x35; + ksum35 += (uint32_t) v1x35; + ksum35 += (uint32_t) v2x35; + ksum35 += (uint32_t) v3x35; + out[140] = v0x35; + out[141] = v1x35; + out[142] = v2x35; + out[143] = v3x35; + } + if (36 < n) { + const int8_t v0x36 = w0[36]; + const int8_t v1x36 = w1[36]; + const int8_t v2x36 = w2[36]; + const int8_t v3x36 = w3[36]; + ksum36 += (uint32_t) v0x36; + ksum36 += (uint32_t) v1x36; + ksum36 
+= (uint32_t) v2x36; + ksum36 += (uint32_t) v3x36; + out[144] = v0x36; + out[145] = v1x36; + out[146] = v2x36; + out[147] = v3x36; + } + if (37 < n) { + const int8_t v0x37 = w0[37]; + const int8_t v1x37 = w1[37]; + const int8_t v2x37 = w2[37]; + const int8_t v3x37 = w3[37]; + ksum37 += (uint32_t) v0x37; + ksum37 += (uint32_t) v1x37; + ksum37 += (uint32_t) v2x37; + ksum37 += (uint32_t) v3x37; + out[148] = v0x37; + out[149] = v1x37; + out[150] = v2x37; + out[151] = v3x37; + } + if (38 < n) { + const int8_t v0x38 = w0[38]; + const int8_t v1x38 = w1[38]; + const int8_t v2x38 = w2[38]; + const int8_t v3x38 = w3[38]; + ksum38 += (uint32_t) v0x38; + ksum38 += (uint32_t) v1x38; + ksum38 += (uint32_t) v2x38; + ksum38 += (uint32_t) v3x38; + out[152] = v0x38; + out[153] = v1x38; + out[154] = v2x38; + out[155] = v3x38; + } + if (39 < n) { + const int8_t v0x39 = w0[39]; + const int8_t v1x39 = w1[39]; + const int8_t v2x39 = w2[39]; + const int8_t v3x39 = w3[39]; + ksum39 += (uint32_t) v0x39; + ksum39 += (uint32_t) v1x39; + ksum39 += (uint32_t) v2x39; + ksum39 += (uint32_t) v3x39; + out[156] = v0x39; + out[157] = v1x39; + out[158] = v2x39; + out[159] = v3x39; + } + if (40 < n) { + const int8_t v0x40 = w0[40]; + const int8_t v1x40 = w1[40]; + const int8_t v2x40 = w2[40]; + const int8_t v3x40 = w3[40]; + ksum40 += (uint32_t) v0x40; + ksum40 += (uint32_t) v1x40; + ksum40 += (uint32_t) v2x40; + ksum40 += (uint32_t) v3x40; + out[160] = v0x40; + out[161] = v1x40; + out[162] = v2x40; + out[163] = v3x40; + } + if (41 < n) { + const int8_t v0x41 = w0[41]; + const int8_t v1x41 = w1[41]; + const int8_t v2x41 = w2[41]; + const int8_t v3x41 = w3[41]; + ksum41 += (uint32_t) v0x41; + ksum41 += (uint32_t) v1x41; + ksum41 += (uint32_t) v2x41; + ksum41 += (uint32_t) v3x41; + out[164] = v0x41; + out[165] = v1x41; + out[166] = v2x41; + out[167] = v3x41; + } + if (42 < n) { + const int8_t v0x42 = w0[42]; + const int8_t v1x42 = w1[42]; + const int8_t v2x42 = w2[42]; + const int8_t v3x42 = w3[42]; + 
ksum42 += (uint32_t) v0x42; + ksum42 += (uint32_t) v1x42; + ksum42 += (uint32_t) v2x42; + ksum42 += (uint32_t) v3x42; + out[168] = v0x42; + out[169] = v1x42; + out[170] = v2x42; + out[171] = v3x42; + } + if (43 < n) { + const int8_t v0x43 = w0[43]; + const int8_t v1x43 = w1[43]; + const int8_t v2x43 = w2[43]; + const int8_t v3x43 = w3[43]; + ksum43 += (uint32_t) v0x43; + ksum43 += (uint32_t) v1x43; + ksum43 += (uint32_t) v2x43; + ksum43 += (uint32_t) v3x43; + out[172] = v0x43; + out[173] = v1x43; + out[174] = v2x43; + out[175] = v3x43; + } + if (44 < n) { + const int8_t v0x44 = w0[44]; + const int8_t v1x44 = w1[44]; + const int8_t v2x44 = w2[44]; + const int8_t v3x44 = w3[44]; + ksum44 += (uint32_t) v0x44; + ksum44 += (uint32_t) v1x44; + ksum44 += (uint32_t) v2x44; + ksum44 += (uint32_t) v3x44; + out[176] = v0x44; + out[177] = v1x44; + out[178] = v2x44; + out[179] = v3x44; + } + if (45 < n) { + const int8_t v0x45 = w0[45]; + const int8_t v1x45 = w1[45]; + const int8_t v2x45 = w2[45]; + const int8_t v3x45 = w3[45]; + ksum45 += (uint32_t) v0x45; + ksum45 += (uint32_t) v1x45; + ksum45 += (uint32_t) v2x45; + ksum45 += (uint32_t) v3x45; + out[180] = v0x45; + out[181] = v1x45; + out[182] = v2x45; + out[183] = v3x45; + } + if (46 < n) { + const int8_t v0x46 = w0[46]; + const int8_t v1x46 = w1[46]; + const int8_t v2x46 = w2[46]; + const int8_t v3x46 = w3[46]; + ksum46 += (uint32_t) v0x46; + ksum46 += (uint32_t) v1x46; + ksum46 += (uint32_t) v2x46; + ksum46 += (uint32_t) v3x46; + out[184] = v0x46; + out[185] = v1x46; + out[186] = v2x46; + out[187] = v3x46; + } + if (47 < n) { + const int8_t v0x47 = w0[47]; + const int8_t v1x47 = w1[47]; + const int8_t v2x47 = w2[47]; + const int8_t v3x47 = w3[47]; + ksum47 += (uint32_t) v0x47; + ksum47 += (uint32_t) v1x47; + ksum47 += (uint32_t) v2x47; + ksum47 += (uint32_t) v3x47; + out[188] = v0x47; + out[189] = v1x47; + out[190] = v2x47; + out[191] = v3x47; + } + if (48 < n) { + const int8_t v0x48 = w0[48]; + const int8_t v1x48 = w1[48]; 
+ const int8_t v2x48 = w2[48]; + const int8_t v3x48 = w3[48]; + ksum48 += (uint32_t) v0x48; + ksum48 += (uint32_t) v1x48; + ksum48 += (uint32_t) v2x48; + ksum48 += (uint32_t) v3x48; + out[192] = v0x48; + out[193] = v1x48; + out[194] = v2x48; + out[195] = v3x48; + } + if (49 < n) { + const int8_t v0x49 = w0[49]; + const int8_t v1x49 = w1[49]; + const int8_t v2x49 = w2[49]; + const int8_t v3x49 = w3[49]; + ksum49 += (uint32_t) v0x49; + ksum49 += (uint32_t) v1x49; + ksum49 += (uint32_t) v2x49; + ksum49 += (uint32_t) v3x49; + out[196] = v0x49; + out[197] = v1x49; + out[198] = v2x49; + out[199] = v3x49; + } + if (50 < n) { + const int8_t v0x50 = w0[50]; + const int8_t v1x50 = w1[50]; + const int8_t v2x50 = w2[50]; + const int8_t v3x50 = w3[50]; + ksum50 += (uint32_t) v0x50; + ksum50 += (uint32_t) v1x50; + ksum50 += (uint32_t) v2x50; + ksum50 += (uint32_t) v3x50; + out[200] = v0x50; + out[201] = v1x50; + out[202] = v2x50; + out[203] = v3x50; + } + if (51 < n) { + const int8_t v0x51 = w0[51]; + const int8_t v1x51 = w1[51]; + const int8_t v2x51 = w2[51]; + const int8_t v3x51 = w3[51]; + ksum51 += (uint32_t) v0x51; + ksum51 += (uint32_t) v1x51; + ksum51 += (uint32_t) v2x51; + ksum51 += (uint32_t) v3x51; + out[204] = v0x51; + out[205] = v1x51; + out[206] = v2x51; + out[207] = v3x51; + } + if (52 < n) { + const int8_t v0x52 = w0[52]; + const int8_t v1x52 = w1[52]; + const int8_t v2x52 = w2[52]; + const int8_t v3x52 = w3[52]; + ksum52 += (uint32_t) v0x52; + ksum52 += (uint32_t) v1x52; + ksum52 += (uint32_t) v2x52; + ksum52 += (uint32_t) v3x52; + out[208] = v0x52; + out[209] = v1x52; + out[210] = v2x52; + out[211] = v3x52; + } + if (53 < n) { + const int8_t v0x53 = w0[53]; + const int8_t v1x53 = w1[53]; + const int8_t v2x53 = w2[53]; + const int8_t v3x53 = w3[53]; + ksum53 += (uint32_t) v0x53; + ksum53 += (uint32_t) v1x53; + ksum53 += (uint32_t) v2x53; + ksum53 += (uint32_t) v3x53; + out[212] = v0x53; + out[213] = v1x53; + out[214] = v2x53; + out[215] = v3x53; + } + if (54 < n) 
{ + const int8_t v0x54 = w0[54]; + const int8_t v1x54 = w1[54]; + const int8_t v2x54 = w2[54]; + const int8_t v3x54 = w3[54]; + ksum54 += (uint32_t) v0x54; + ksum54 += (uint32_t) v1x54; + ksum54 += (uint32_t) v2x54; + ksum54 += (uint32_t) v3x54; + out[216] = v0x54; + out[217] = v1x54; + out[218] = v2x54; + out[219] = v3x54; + } + if (55 < n) { + const int8_t v0x55 = w0[55]; + const int8_t v1x55 = w1[55]; + const int8_t v2x55 = w2[55]; + const int8_t v3x55 = w3[55]; + ksum55 += (uint32_t) v0x55; + ksum55 += (uint32_t) v1x55; + ksum55 += (uint32_t) v2x55; + ksum55 += (uint32_t) v3x55; + out[220] = v0x55; + out[221] = v1x55; + out[222] = v2x55; + out[223] = v3x55; + } + if (56 < n) { + const int8_t v0x56 = w0[56]; + const int8_t v1x56 = w1[56]; + const int8_t v2x56 = w2[56]; + const int8_t v3x56 = w3[56]; + ksum56 += (uint32_t) v0x56; + ksum56 += (uint32_t) v1x56; + ksum56 += (uint32_t) v2x56; + ksum56 += (uint32_t) v3x56; + out[224] = v0x56; + out[225] = v1x56; + out[226] = v2x56; + out[227] = v3x56; + } + if (57 < n) { + const int8_t v0x57 = w0[57]; + const int8_t v1x57 = w1[57]; + const int8_t v2x57 = w2[57]; + const int8_t v3x57 = w3[57]; + ksum57 += (uint32_t) v0x57; + ksum57 += (uint32_t) v1x57; + ksum57 += (uint32_t) v2x57; + ksum57 += (uint32_t) v3x57; + out[228] = v0x57; + out[229] = v1x57; + out[230] = v2x57; + out[231] = v3x57; + } + if (58 < n) { + const int8_t v0x58 = w0[58]; + const int8_t v1x58 = w1[58]; + const int8_t v2x58 = w2[58]; + const int8_t v3x58 = w3[58]; + ksum58 += (uint32_t) v0x58; + ksum58 += (uint32_t) v1x58; + ksum58 += (uint32_t) v2x58; + ksum58 += (uint32_t) v3x58; + out[232] = v0x58; + out[233] = v1x58; + out[234] = v2x58; + out[235] = v3x58; + } + if (59 < n) { + const int8_t v0x59 = w0[59]; + const int8_t v1x59 = w1[59]; + const int8_t v2x59 = w2[59]; + const int8_t v3x59 = w3[59]; + ksum59 += (uint32_t) v0x59; + ksum59 += (uint32_t) v1x59; + ksum59 += (uint32_t) v2x59; + ksum59 += (uint32_t) v3x59; + out[236] = v0x59; + out[237] = 
v1x59; + out[238] = v2x59; + out[239] = v3x59; + } + if (60 < n) { + const int8_t v0x60 = w0[60]; + const int8_t v1x60 = w1[60]; + const int8_t v2x60 = w2[60]; + const int8_t v3x60 = w3[60]; + ksum60 += (uint32_t) v0x60; + ksum60 += (uint32_t) v1x60; + ksum60 += (uint32_t) v2x60; + ksum60 += (uint32_t) v3x60; + out[240] = v0x60; + out[241] = v1x60; + out[242] = v2x60; + out[243] = v3x60; + } + if (61 < n) { + const int8_t v0x61 = w0[61]; + const int8_t v1x61 = w1[61]; + const int8_t v2x61 = w2[61]; + const int8_t v3x61 = w3[61]; + ksum61 += (uint32_t) v0x61; + ksum61 += (uint32_t) v1x61; + ksum61 += (uint32_t) v2x61; + ksum61 += (uint32_t) v3x61; + out[244] = v0x61; + out[245] = v1x61; + out[246] = v2x61; + out[247] = v3x61; + } + if (62 < n) { + const int8_t v0x62 = w0[62]; + const int8_t v1x62 = w1[62]; + const int8_t v2x62 = w2[62]; + const int8_t v3x62 = w3[62]; + ksum62 += (uint32_t) v0x62; + ksum62 += (uint32_t) v1x62; + ksum62 += (uint32_t) v2x62; + ksum62 += (uint32_t) v3x62; + out[248] = v0x62; + out[249] = v1x62; + out[250] = v2x62; + out[251] = v3x62; + } + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 256; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + 
} + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const 
int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[32] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[33] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[34] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[35] = v3x8; + } + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[36] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[37] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[38] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[39] = v3x9; + } + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[40] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[41] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[42] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[43] = v3x10; + } + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[44] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[45] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[46] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[47] = v3x11; + } + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[48] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[49] = v1x12; 
+ } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[50] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[51] = v3x12; + } + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[52] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[53] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[54] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[55] = v3x13; + } + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[56] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[57] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[58] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[59] = v3x14; + } + } + if (15 < n) { + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[60] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[61] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[62] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[63] = v3x15; + } + } + if (16 < n) { + const int8_t v0x16 = w0[16]; + ksum16 += (uint32_t) v0x16; + out[64] = v0x16; + if (1 < k) { + const int8_t v1x16 = w1[16]; + ksum16 += (uint32_t) v1x16; + out[65] = v1x16; + } + if (2 < k) { + const int8_t v2x16 = w2[16]; + ksum16 += (uint32_t) v2x16; + out[66] = v2x16; + } + if (3 < k) { + const int8_t v3x16 = w3[16]; + ksum16 += (uint32_t) v3x16; + out[67] = v3x16; + } + } + if (17 < n) { + const int8_t v0x17 = w0[17]; + ksum17 += (uint32_t) v0x17; + out[68] = v0x17; + if (1 < k) { + const int8_t v1x17 = w1[17]; + ksum17 += (uint32_t) v1x17; + out[69] = v1x17; + } + if (2 < 
k) { + const int8_t v2x17 = w2[17]; + ksum17 += (uint32_t) v2x17; + out[70] = v2x17; + } + if (3 < k) { + const int8_t v3x17 = w3[17]; + ksum17 += (uint32_t) v3x17; + out[71] = v3x17; + } + } + if (18 < n) { + const int8_t v0x18 = w0[18]; + ksum18 += (uint32_t) v0x18; + out[72] = v0x18; + if (1 < k) { + const int8_t v1x18 = w1[18]; + ksum18 += (uint32_t) v1x18; + out[73] = v1x18; + } + if (2 < k) { + const int8_t v2x18 = w2[18]; + ksum18 += (uint32_t) v2x18; + out[74] = v2x18; + } + if (3 < k) { + const int8_t v3x18 = w3[18]; + ksum18 += (uint32_t) v3x18; + out[75] = v3x18; + } + } + if (19 < n) { + const int8_t v0x19 = w0[19]; + ksum19 += (uint32_t) v0x19; + out[76] = v0x19; + if (1 < k) { + const int8_t v1x19 = w1[19]; + ksum19 += (uint32_t) v1x19; + out[77] = v1x19; + } + if (2 < k) { + const int8_t v2x19 = w2[19]; + ksum19 += (uint32_t) v2x19; + out[78] = v2x19; + } + if (3 < k) { + const int8_t v3x19 = w3[19]; + ksum19 += (uint32_t) v3x19; + out[79] = v3x19; + } + } + if (20 < n) { + const int8_t v0x20 = w0[20]; + ksum20 += (uint32_t) v0x20; + out[80] = v0x20; + if (1 < k) { + const int8_t v1x20 = w1[20]; + ksum20 += (uint32_t) v1x20; + out[81] = v1x20; + } + if (2 < k) { + const int8_t v2x20 = w2[20]; + ksum20 += (uint32_t) v2x20; + out[82] = v2x20; + } + if (3 < k) { + const int8_t v3x20 = w3[20]; + ksum20 += (uint32_t) v3x20; + out[83] = v3x20; + } + } + if (21 < n) { + const int8_t v0x21 = w0[21]; + ksum21 += (uint32_t) v0x21; + out[84] = v0x21; + if (1 < k) { + const int8_t v1x21 = w1[21]; + ksum21 += (uint32_t) v1x21; + out[85] = v1x21; + } + if (2 < k) { + const int8_t v2x21 = w2[21]; + ksum21 += (uint32_t) v2x21; + out[86] = v2x21; + } + if (3 < k) { + const int8_t v3x21 = w3[21]; + ksum21 += (uint32_t) v3x21; + out[87] = v3x21; + } + } + if (22 < n) { + const int8_t v0x22 = w0[22]; + ksum22 += (uint32_t) v0x22; + out[88] = v0x22; + if (1 < k) { + const int8_t v1x22 = w1[22]; + ksum22 += (uint32_t) v1x22; + out[89] = v1x22; + } + if (2 < k) { + const 
int8_t v2x22 = w2[22]; + ksum22 += (uint32_t) v2x22; + out[90] = v2x22; + } + if (3 < k) { + const int8_t v3x22 = w3[22]; + ksum22 += (uint32_t) v3x22; + out[91] = v3x22; + } + } + if (23 < n) { + const int8_t v0x23 = w0[23]; + ksum23 += (uint32_t) v0x23; + out[92] = v0x23; + if (1 < k) { + const int8_t v1x23 = w1[23]; + ksum23 += (uint32_t) v1x23; + out[93] = v1x23; + } + if (2 < k) { + const int8_t v2x23 = w2[23]; + ksum23 += (uint32_t) v2x23; + out[94] = v2x23; + } + if (3 < k) { + const int8_t v3x23 = w3[23]; + ksum23 += (uint32_t) v3x23; + out[95] = v3x23; + } + } + if (24 < n) { + const int8_t v0x24 = w0[24]; + ksum24 += (uint32_t) v0x24; + out[96] = v0x24; + if (1 < k) { + const int8_t v1x24 = w1[24]; + ksum24 += (uint32_t) v1x24; + out[97] = v1x24; + } + if (2 < k) { + const int8_t v2x24 = w2[24]; + ksum24 += (uint32_t) v2x24; + out[98] = v2x24; + } + if (3 < k) { + const int8_t v3x24 = w3[24]; + ksum24 += (uint32_t) v3x24; + out[99] = v3x24; + } + } + if (25 < n) { + const int8_t v0x25 = w0[25]; + ksum25 += (uint32_t) v0x25; + out[100] = v0x25; + if (1 < k) { + const int8_t v1x25 = w1[25]; + ksum25 += (uint32_t) v1x25; + out[101] = v1x25; + } + if (2 < k) { + const int8_t v2x25 = w2[25]; + ksum25 += (uint32_t) v2x25; + out[102] = v2x25; + } + if (3 < k) { + const int8_t v3x25 = w3[25]; + ksum25 += (uint32_t) v3x25; + out[103] = v3x25; + } + } + if (26 < n) { + const int8_t v0x26 = w0[26]; + ksum26 += (uint32_t) v0x26; + out[104] = v0x26; + if (1 < k) { + const int8_t v1x26 = w1[26]; + ksum26 += (uint32_t) v1x26; + out[105] = v1x26; + } + if (2 < k) { + const int8_t v2x26 = w2[26]; + ksum26 += (uint32_t) v2x26; + out[106] = v2x26; + } + if (3 < k) { + const int8_t v3x26 = w3[26]; + ksum26 += (uint32_t) v3x26; + out[107] = v3x26; + } + } + if (27 < n) { + const int8_t v0x27 = w0[27]; + ksum27 += (uint32_t) v0x27; + out[108] = v0x27; + if (1 < k) { + const int8_t v1x27 = w1[27]; + ksum27 += (uint32_t) v1x27; + out[109] = v1x27; + } + if (2 < k) { + const 
int8_t v2x27 = w2[27]; + ksum27 += (uint32_t) v2x27; + out[110] = v2x27; + } + if (3 < k) { + const int8_t v3x27 = w3[27]; + ksum27 += (uint32_t) v3x27; + out[111] = v3x27; + } + } + if (28 < n) { + const int8_t v0x28 = w0[28]; + ksum28 += (uint32_t) v0x28; + out[112] = v0x28; + if (1 < k) { + const int8_t v1x28 = w1[28]; + ksum28 += (uint32_t) v1x28; + out[113] = v1x28; + } + if (2 < k) { + const int8_t v2x28 = w2[28]; + ksum28 += (uint32_t) v2x28; + out[114] = v2x28; + } + if (3 < k) { + const int8_t v3x28 = w3[28]; + ksum28 += (uint32_t) v3x28; + out[115] = v3x28; + } + } + if (29 < n) { + const int8_t v0x29 = w0[29]; + ksum29 += (uint32_t) v0x29; + out[116] = v0x29; + if (1 < k) { + const int8_t v1x29 = w1[29]; + ksum29 += (uint32_t) v1x29; + out[117] = v1x29; + } + if (2 < k) { + const int8_t v2x29 = w2[29]; + ksum29 += (uint32_t) v2x29; + out[118] = v2x29; + } + if (3 < k) { + const int8_t v3x29 = w3[29]; + ksum29 += (uint32_t) v3x29; + out[119] = v3x29; + } + } + if (30 < n) { + const int8_t v0x30 = w0[30]; + ksum30 += (uint32_t) v0x30; + out[120] = v0x30; + if (1 < k) { + const int8_t v1x30 = w1[30]; + ksum30 += (uint32_t) v1x30; + out[121] = v1x30; + } + if (2 < k) { + const int8_t v2x30 = w2[30]; + ksum30 += (uint32_t) v2x30; + out[122] = v2x30; + } + if (3 < k) { + const int8_t v3x30 = w3[30]; + ksum30 += (uint32_t) v3x30; + out[123] = v3x30; + } + } + if (31 < n) { + const int8_t v0x31 = w0[31]; + ksum31 += (uint32_t) v0x31; + out[124] = v0x31; + if (1 < k) { + const int8_t v1x31 = w1[31]; + ksum31 += (uint32_t) v1x31; + out[125] = v1x31; + } + if (2 < k) { + const int8_t v2x31 = w2[31]; + ksum31 += (uint32_t) v2x31; + out[126] = v2x31; + } + if (3 < k) { + const int8_t v3x31 = w3[31]; + ksum31 += (uint32_t) v3x31; + out[127] = v3x31; + } + } + if (32 < n) { + const int8_t v0x32 = w0[32]; + ksum32 += (uint32_t) v0x32; + out[128] = v0x32; + if (1 < k) { + const int8_t v1x32 = w1[32]; + ksum32 += (uint32_t) v1x32; + out[129] = v1x32; + } + if (2 < k) { + 
const int8_t v2x32 = w2[32]; + ksum32 += (uint32_t) v2x32; + out[130] = v2x32; + } + if (3 < k) { + const int8_t v3x32 = w3[32]; + ksum32 += (uint32_t) v3x32; + out[131] = v3x32; + } + } + if (33 < n) { + const int8_t v0x33 = w0[33]; + ksum33 += (uint32_t) v0x33; + out[132] = v0x33; + if (1 < k) { + const int8_t v1x33 = w1[33]; + ksum33 += (uint32_t) v1x33; + out[133] = v1x33; + } + if (2 < k) { + const int8_t v2x33 = w2[33]; + ksum33 += (uint32_t) v2x33; + out[134] = v2x33; + } + if (3 < k) { + const int8_t v3x33 = w3[33]; + ksum33 += (uint32_t) v3x33; + out[135] = v3x33; + } + } + if (34 < n) { + const int8_t v0x34 = w0[34]; + ksum34 += (uint32_t) v0x34; + out[136] = v0x34; + if (1 < k) { + const int8_t v1x34 = w1[34]; + ksum34 += (uint32_t) v1x34; + out[137] = v1x34; + } + if (2 < k) { + const int8_t v2x34 = w2[34]; + ksum34 += (uint32_t) v2x34; + out[138] = v2x34; + } + if (3 < k) { + const int8_t v3x34 = w3[34]; + ksum34 += (uint32_t) v3x34; + out[139] = v3x34; + } + } + if (35 < n) { + const int8_t v0x35 = w0[35]; + ksum35 += (uint32_t) v0x35; + out[140] = v0x35; + if (1 < k) { + const int8_t v1x35 = w1[35]; + ksum35 += (uint32_t) v1x35; + out[141] = v1x35; + } + if (2 < k) { + const int8_t v2x35 = w2[35]; + ksum35 += (uint32_t) v2x35; + out[142] = v2x35; + } + if (3 < k) { + const int8_t v3x35 = w3[35]; + ksum35 += (uint32_t) v3x35; + out[143] = v3x35; + } + } + if (36 < n) { + const int8_t v0x36 = w0[36]; + ksum36 += (uint32_t) v0x36; + out[144] = v0x36; + if (1 < k) { + const int8_t v1x36 = w1[36]; + ksum36 += (uint32_t) v1x36; + out[145] = v1x36; + } + if (2 < k) { + const int8_t v2x36 = w2[36]; + ksum36 += (uint32_t) v2x36; + out[146] = v2x36; + } + if (3 < k) { + const int8_t v3x36 = w3[36]; + ksum36 += (uint32_t) v3x36; + out[147] = v3x36; + } + } + if (37 < n) { + const int8_t v0x37 = w0[37]; + ksum37 += (uint32_t) v0x37; + out[148] = v0x37; + if (1 < k) { + const int8_t v1x37 = w1[37]; + ksum37 += (uint32_t) v1x37; + out[149] = v1x37; + } + if (2 < 
k) { + const int8_t v2x37 = w2[37]; + ksum37 += (uint32_t) v2x37; + out[150] = v2x37; + } + if (3 < k) { + const int8_t v3x37 = w3[37]; + ksum37 += (uint32_t) v3x37; + out[151] = v3x37; + } + } + if (38 < n) { + const int8_t v0x38 = w0[38]; + ksum38 += (uint32_t) v0x38; + out[152] = v0x38; + if (1 < k) { + const int8_t v1x38 = w1[38]; + ksum38 += (uint32_t) v1x38; + out[153] = v1x38; + } + if (2 < k) { + const int8_t v2x38 = w2[38]; + ksum38 += (uint32_t) v2x38; + out[154] = v2x38; + } + if (3 < k) { + const int8_t v3x38 = w3[38]; + ksum38 += (uint32_t) v3x38; + out[155] = v3x38; + } + } + if (39 < n) { + const int8_t v0x39 = w0[39]; + ksum39 += (uint32_t) v0x39; + out[156] = v0x39; + if (1 < k) { + const int8_t v1x39 = w1[39]; + ksum39 += (uint32_t) v1x39; + out[157] = v1x39; + } + if (2 < k) { + const int8_t v2x39 = w2[39]; + ksum39 += (uint32_t) v2x39; + out[158] = v2x39; + } + if (3 < k) { + const int8_t v3x39 = w3[39]; + ksum39 += (uint32_t) v3x39; + out[159] = v3x39; + } + } + if (40 < n) { + const int8_t v0x40 = w0[40]; + ksum40 += (uint32_t) v0x40; + out[160] = v0x40; + if (1 < k) { + const int8_t v1x40 = w1[40]; + ksum40 += (uint32_t) v1x40; + out[161] = v1x40; + } + if (2 < k) { + const int8_t v2x40 = w2[40]; + ksum40 += (uint32_t) v2x40; + out[162] = v2x40; + } + if (3 < k) { + const int8_t v3x40 = w3[40]; + ksum40 += (uint32_t) v3x40; + out[163] = v3x40; + } + } + if (41 < n) { + const int8_t v0x41 = w0[41]; + ksum41 += (uint32_t) v0x41; + out[164] = v0x41; + if (1 < k) { + const int8_t v1x41 = w1[41]; + ksum41 += (uint32_t) v1x41; + out[165] = v1x41; + } + if (2 < k) { + const int8_t v2x41 = w2[41]; + ksum41 += (uint32_t) v2x41; + out[166] = v2x41; + } + if (3 < k) { + const int8_t v3x41 = w3[41]; + ksum41 += (uint32_t) v3x41; + out[167] = v3x41; + } + } + if (42 < n) { + const int8_t v0x42 = w0[42]; + ksum42 += (uint32_t) v0x42; + out[168] = v0x42; + if (1 < k) { + const int8_t v1x42 = w1[42]; + ksum42 += (uint32_t) v1x42; + out[169] = v1x42; + } + if 
(2 < k) { + const int8_t v2x42 = w2[42]; + ksum42 += (uint32_t) v2x42; + out[170] = v2x42; + } + if (3 < k) { + const int8_t v3x42 = w3[42]; + ksum42 += (uint32_t) v3x42; + out[171] = v3x42; + } + } + if (43 < n) { + const int8_t v0x43 = w0[43]; + ksum43 += (uint32_t) v0x43; + out[172] = v0x43; + if (1 < k) { + const int8_t v1x43 = w1[43]; + ksum43 += (uint32_t) v1x43; + out[173] = v1x43; + } + if (2 < k) { + const int8_t v2x43 = w2[43]; + ksum43 += (uint32_t) v2x43; + out[174] = v2x43; + } + if (3 < k) { + const int8_t v3x43 = w3[43]; + ksum43 += (uint32_t) v3x43; + out[175] = v3x43; + } + } + if (44 < n) { + const int8_t v0x44 = w0[44]; + ksum44 += (uint32_t) v0x44; + out[176] = v0x44; + if (1 < k) { + const int8_t v1x44 = w1[44]; + ksum44 += (uint32_t) v1x44; + out[177] = v1x44; + } + if (2 < k) { + const int8_t v2x44 = w2[44]; + ksum44 += (uint32_t) v2x44; + out[178] = v2x44; + } + if (3 < k) { + const int8_t v3x44 = w3[44]; + ksum44 += (uint32_t) v3x44; + out[179] = v3x44; + } + } + if (45 < n) { + const int8_t v0x45 = w0[45]; + ksum45 += (uint32_t) v0x45; + out[180] = v0x45; + if (1 < k) { + const int8_t v1x45 = w1[45]; + ksum45 += (uint32_t) v1x45; + out[181] = v1x45; + } + if (2 < k) { + const int8_t v2x45 = w2[45]; + ksum45 += (uint32_t) v2x45; + out[182] = v2x45; + } + if (3 < k) { + const int8_t v3x45 = w3[45]; + ksum45 += (uint32_t) v3x45; + out[183] = v3x45; + } + } + if (46 < n) { + const int8_t v0x46 = w0[46]; + ksum46 += (uint32_t) v0x46; + out[184] = v0x46; + if (1 < k) { + const int8_t v1x46 = w1[46]; + ksum46 += (uint32_t) v1x46; + out[185] = v1x46; + } + if (2 < k) { + const int8_t v2x46 = w2[46]; + ksum46 += (uint32_t) v2x46; + out[186] = v2x46; + } + if (3 < k) { + const int8_t v3x46 = w3[46]; + ksum46 += (uint32_t) v3x46; + out[187] = v3x46; + } + } + if (47 < n) { + const int8_t v0x47 = w0[47]; + ksum47 += (uint32_t) v0x47; + out[188] = v0x47; + if (1 < k) { + const int8_t v1x47 = w1[47]; + ksum47 += (uint32_t) v1x47; + out[189] = v1x47; + } 
+ if (2 < k) { + const int8_t v2x47 = w2[47]; + ksum47 += (uint32_t) v2x47; + out[190] = v2x47; + } + if (3 < k) { + const int8_t v3x47 = w3[47]; + ksum47 += (uint32_t) v3x47; + out[191] = v3x47; + } + } + if (48 < n) { + const int8_t v0x48 = w0[48]; + ksum48 += (uint32_t) v0x48; + out[192] = v0x48; + if (1 < k) { + const int8_t v1x48 = w1[48]; + ksum48 += (uint32_t) v1x48; + out[193] = v1x48; + } + if (2 < k) { + const int8_t v2x48 = w2[48]; + ksum48 += (uint32_t) v2x48; + out[194] = v2x48; + } + if (3 < k) { + const int8_t v3x48 = w3[48]; + ksum48 += (uint32_t) v3x48; + out[195] = v3x48; + } + } + if (49 < n) { + const int8_t v0x49 = w0[49]; + ksum49 += (uint32_t) v0x49; + out[196] = v0x49; + if (1 < k) { + const int8_t v1x49 = w1[49]; + ksum49 += (uint32_t) v1x49; + out[197] = v1x49; + } + if (2 < k) { + const int8_t v2x49 = w2[49]; + ksum49 += (uint32_t) v2x49; + out[198] = v2x49; + } + if (3 < k) { + const int8_t v3x49 = w3[49]; + ksum49 += (uint32_t) v3x49; + out[199] = v3x49; + } + } + if (50 < n) { + const int8_t v0x50 = w0[50]; + ksum50 += (uint32_t) v0x50; + out[200] = v0x50; + if (1 < k) { + const int8_t v1x50 = w1[50]; + ksum50 += (uint32_t) v1x50; + out[201] = v1x50; + } + if (2 < k) { + const int8_t v2x50 = w2[50]; + ksum50 += (uint32_t) v2x50; + out[202] = v2x50; + } + if (3 < k) { + const int8_t v3x50 = w3[50]; + ksum50 += (uint32_t) v3x50; + out[203] = v3x50; + } + } + if (51 < n) { + const int8_t v0x51 = w0[51]; + ksum51 += (uint32_t) v0x51; + out[204] = v0x51; + if (1 < k) { + const int8_t v1x51 = w1[51]; + ksum51 += (uint32_t) v1x51; + out[205] = v1x51; + } + if (2 < k) { + const int8_t v2x51 = w2[51]; + ksum51 += (uint32_t) v2x51; + out[206] = v2x51; + } + if (3 < k) { + const int8_t v3x51 = w3[51]; + ksum51 += (uint32_t) v3x51; + out[207] = v3x51; + } + } + if (52 < n) { + const int8_t v0x52 = w0[52]; + ksum52 += (uint32_t) v0x52; + out[208] = v0x52; + if (1 < k) { + const int8_t v1x52 = w1[52]; + ksum52 += (uint32_t) v1x52; + out[209] = 
v1x52; + } + if (2 < k) { + const int8_t v2x52 = w2[52]; + ksum52 += (uint32_t) v2x52; + out[210] = v2x52; + } + if (3 < k) { + const int8_t v3x52 = w3[52]; + ksum52 += (uint32_t) v3x52; + out[211] = v3x52; + } + } + if (53 < n) { + const int8_t v0x53 = w0[53]; + ksum53 += (uint32_t) v0x53; + out[212] = v0x53; + if (1 < k) { + const int8_t v1x53 = w1[53]; + ksum53 += (uint32_t) v1x53; + out[213] = v1x53; + } + if (2 < k) { + const int8_t v2x53 = w2[53]; + ksum53 += (uint32_t) v2x53; + out[214] = v2x53; + } + if (3 < k) { + const int8_t v3x53 = w3[53]; + ksum53 += (uint32_t) v3x53; + out[215] = v3x53; + } + } + if (54 < n) { + const int8_t v0x54 = w0[54]; + ksum54 += (uint32_t) v0x54; + out[216] = v0x54; + if (1 < k) { + const int8_t v1x54 = w1[54]; + ksum54 += (uint32_t) v1x54; + out[217] = v1x54; + } + if (2 < k) { + const int8_t v2x54 = w2[54]; + ksum54 += (uint32_t) v2x54; + out[218] = v2x54; + } + if (3 < k) { + const int8_t v3x54 = w3[54]; + ksum54 += (uint32_t) v3x54; + out[219] = v3x54; + } + } + if (55 < n) { + const int8_t v0x55 = w0[55]; + ksum55 += (uint32_t) v0x55; + out[220] = v0x55; + if (1 < k) { + const int8_t v1x55 = w1[55]; + ksum55 += (uint32_t) v1x55; + out[221] = v1x55; + } + if (2 < k) { + const int8_t v2x55 = w2[55]; + ksum55 += (uint32_t) v2x55; + out[222] = v2x55; + } + if (3 < k) { + const int8_t v3x55 = w3[55]; + ksum55 += (uint32_t) v3x55; + out[223] = v3x55; + } + } + if (56 < n) { + const int8_t v0x56 = w0[56]; + ksum56 += (uint32_t) v0x56; + out[224] = v0x56; + if (1 < k) { + const int8_t v1x56 = w1[56]; + ksum56 += (uint32_t) v1x56; + out[225] = v1x56; + } + if (2 < k) { + const int8_t v2x56 = w2[56]; + ksum56 += (uint32_t) v2x56; + out[226] = v2x56; + } + if (3 < k) { + const int8_t v3x56 = w3[56]; + ksum56 += (uint32_t) v3x56; + out[227] = v3x56; + } + } + if (57 < n) { + const int8_t v0x57 = w0[57]; + ksum57 += (uint32_t) v0x57; + out[228] = v0x57; + if (1 < k) { + const int8_t v1x57 = w1[57]; + ksum57 += (uint32_t) v1x57; + 
out[229] = v1x57; + } + if (2 < k) { + const int8_t v2x57 = w2[57]; + ksum57 += (uint32_t) v2x57; + out[230] = v2x57; + } + if (3 < k) { + const int8_t v3x57 = w3[57]; + ksum57 += (uint32_t) v3x57; + out[231] = v3x57; + } + } + if (58 < n) { + const int8_t v0x58 = w0[58]; + ksum58 += (uint32_t) v0x58; + out[232] = v0x58; + if (1 < k) { + const int8_t v1x58 = w1[58]; + ksum58 += (uint32_t) v1x58; + out[233] = v1x58; + } + if (2 < k) { + const int8_t v2x58 = w2[58]; + ksum58 += (uint32_t) v2x58; + out[234] = v2x58; + } + if (3 < k) { + const int8_t v3x58 = w3[58]; + ksum58 += (uint32_t) v3x58; + out[235] = v3x58; + } + } + if (59 < n) { + const int8_t v0x59 = w0[59]; + ksum59 += (uint32_t) v0x59; + out[236] = v0x59; + if (1 < k) { + const int8_t v1x59 = w1[59]; + ksum59 += (uint32_t) v1x59; + out[237] = v1x59; + } + if (2 < k) { + const int8_t v2x59 = w2[59]; + ksum59 += (uint32_t) v2x59; + out[238] = v2x59; + } + if (3 < k) { + const int8_t v3x59 = w3[59]; + ksum59 += (uint32_t) v3x59; + out[239] = v3x59; + } + } + if (60 < n) { + const int8_t v0x60 = w0[60]; + ksum60 += (uint32_t) v0x60; + out[240] = v0x60; + if (1 < k) { + const int8_t v1x60 = w1[60]; + ksum60 += (uint32_t) v1x60; + out[241] = v1x60; + } + if (2 < k) { + const int8_t v2x60 = w2[60]; + ksum60 += (uint32_t) v2x60; + out[242] = v2x60; + } + if (3 < k) { + const int8_t v3x60 = w3[60]; + ksum60 += (uint32_t) v3x60; + out[243] = v3x60; + } + } + if (61 < n) { + const int8_t v0x61 = w0[61]; + ksum61 += (uint32_t) v0x61; + out[244] = v0x61; + if (1 < k) { + const int8_t v1x61 = w1[61]; + ksum61 += (uint32_t) v1x61; + out[245] = v1x61; + } + if (2 < k) { + const int8_t v2x61 = w2[61]; + ksum61 += (uint32_t) v2x61; + out[246] = v2x61; + } + if (3 < k) { + const int8_t v3x61 = w3[61]; + ksum61 += (uint32_t) v3x61; + out[247] = v3x61; + } + } + if (62 < n) { + const int8_t v0x62 = w0[62]; + ksum62 += (uint32_t) v0x62; + out[248] = v0x62; + if (1 < k) { + const int8_t v1x62 = w1[62]; + ksum62 += (uint32_t) 
v1x62; + out[249] = v1x62; + } + if (2 < k) { + const int8_t v2x62 = w2[62]; + ksum62 += (uint32_t) v2x62; + out[250] = v2x62; + } + if (3 < k) { + const int8_t v3x62 = w3[62]; + ksum62 += (uint32_t) v3x62; + out[251] = v3x62; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 256; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + packed_b[15] -= ksum15 * izp; + packed_b[16] -= ksum16 * izp; + packed_b[17] -= ksum17 * izp; + packed_b[18] -= ksum18 * izp; + packed_b[19] -= ksum19 * izp; + packed_b[20] -= ksum20 * izp; + packed_b[21] -= ksum21 * izp; + packed_b[22] -= ksum22 * izp; + packed_b[23] -= ksum23 * izp; + packed_b[24] -= ksum24 * izp; + packed_b[25] -= ksum25 * izp; + packed_b[26] -= ksum26 * izp; + packed_b[27] -= ksum27 * izp; + packed_b[28] -= ksum28 * izp; + packed_b[29] -= ksum29 * izp; + packed_b[30] -= ksum30 * izp; + packed_b[31] -= ksum31 * izp; + packed_b[32] -= ksum32 * izp; + packed_b[33] -= ksum33 * izp; + packed_b[34] -= ksum34 * izp; + packed_b[35] -= ksum35 * izp; + packed_b[36] -= ksum36 * izp; + packed_b[37] -= ksum37 * izp; + packed_b[38] -= ksum38 * izp; + packed_b[39] -= ksum39 * izp; + packed_b[40] -= ksum40 * izp; + packed_b[41] -= ksum41 * izp; + packed_b[42] -= ksum42 * izp; + packed_b[43] -= ksum43 * izp; + packed_b[44] -= ksum44 * izp; + packed_b[45] -= ksum45 * izp; + packed_b[46] -= ksum46 * izp; + packed_b[47] -= ksum47 * izp; + packed_b[48] -= ksum48 * izp; + packed_b[49] -= ksum49 * izp; + packed_b[50] -= ksum50 * izp; + packed_b[51] -= ksum51 * izp; + 
packed_b[52] -= ksum52 * izp; + packed_b[53] -= ksum53 * izp; + packed_b[54] -= ksum54 * izp; + packed_b[55] -= ksum55 * izp; + packed_b[56] -= ksum56 * izp; + packed_b[57] -= ksum57 * izp; + packed_b[58] -= ksum58 * izp; + packed_b[59] -= ksum59 * izp; + packed_b[60] -= ksum60 * izp; + packed_b[61] -= ksum61 * izp; + packed_b[62] -= ksum62 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/gen/qs8-packw-x8c4-gemm-gio-scalar.c b/src/qs8-packw/gen/qs8-packw-x8c4-gemm-gio-scalar.c new file mode 100644 index 00000000000..c74e6a9f972 --- /dev/null +++ b/src/qs8-packw/gen/qs8-packw-x8c4-gemm-gio-scalar.c @@ -0,0 +1,655 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_packw_gemm_gio_ukernel_x8c4__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 4); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + b += 8; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + } + out += 8 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + + // KC main loop multiple of 8x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = 
v1x2; + out[10] = v2x2; + out[11] = v3x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + out[28] = v0x7; + out[29] = v1x7; + out[30] = v2x7; + out[31] = v3x7; + w0 += 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 32; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + 
const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[19] = v3x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) 
v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[28] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[29] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[30] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[31] = v3x7; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 32; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 8; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(int32_t); + + // NR remainder has less than 8 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + + // KC main loop multiple of 8x4 + size_t k = kc; + for (; k >= 4; k -= 4) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += 
(uint32_t) v3x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + out[4] = v0x1; + out[5] = v1x1; + out[6] = v2x1; + out[7] = v3x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + out[8] = v0x2; + out[9] = v1x2; + out[10] = v2x2; + out[11] = v3x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + out[12] = v0x3; + out[13] = v1x3; + out[14] = v2x3; + out[15] = v3x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + out[16] = v0x4; + out[17] = v1x4; + out[18] = v2x4; + out[19] = v3x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + out[20] = v0x5; + out[21] = v1x5; + out[22] = v2x5; + out[23] = v3x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + out[24] = v0x6; + out[25] = v1x6; + out[26] = v2x6; + out[27] = v3x6; + } + w0 
+= 4 * k_stride; + w1 += 4 * k_stride; + w2 += 4 * k_stride; + w3 += 4 * k_stride; + out += 32; + } + + // KC remainder of 1..3 + if (k != 0) { + assert(k >= 1 && k <= 3); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[4] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[5] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[6] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[7] = v3x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[8] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[9] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[10] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[11] = v3x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[12] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[13] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[14] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[15] = v3x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[16] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[17] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[18] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += 
(uint32_t) v3x4; + out[19] = v3x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[20] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[21] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[22] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[23] = v3x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[24] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[25] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[26] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[27] = v3x6; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + out += 32; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/gen/qs8-packw-x8c8-gemm-gio-scalar.c b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-gio-scalar.c new file mode 100644 index 00000000000..4c1acd0b6ae --- /dev/null +++ b/src/qs8-packw/gen/qs8-packw-x8c8-gemm-gio-scalar.c @@ -0,0 +1,1159 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_packw_gemm_gio_ukernel_x8c8__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? (((const struct xnn_qs8_packw_params*) params)->input_zero_point + 0): 0); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + b += 8; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + } + out += 8 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t 
v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) 
v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) 
v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + ksum7 += (uint32_t) v4x7; + ksum7 += (uint32_t) v5x7; + ksum7 += (uint32_t) v6x7; + ksum7 += (uint32_t) v7x7; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; + out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + 
} + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + 
out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + 
ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + ksum7 += (uint32_t) v4x7; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + ksum7 += (uint32_t) v5x7; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + ksum7 += (uint32_t) v6x7; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v7x7; + out[63] = v7x7; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 64; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 8; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(int32_t); + + // NR remainder has less than 8 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = 
w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) 
v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] 
= v7x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + } + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = 
v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { 
+ const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = 
w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 64; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/qs8-packw/qs8-packw.h b/src/qs8-packw/qs8-packw.h index 24fc85abf05..0fee494bc43 100644 --- a/src/qs8-packw/qs8-packw.h +++ b/src/qs8-packw/qs8-packw.h @@ -15,6 +15,17 @@ XNN_QS8_UKERNEL(0, xnn_qs8_packw_gemm_goi_ukernel_x16c8__scalar, 16, 8, 1, 8, 1, XNN_QS8_UKERNEL(0, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x8c8__scalar, 8, 8, 1, 8, 1, 128) XNN_QS8_UKERNEL(0, xnn_qs8_to_qu8_packw_gemm_goi_ukernel_x16c8__scalar, 16, 8, 1, 8, 1, 128) +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_packw_gemm_gio_ukernel_x8c4__scalar, 8, 4, 1, 4, 1, 0) +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_packw_gemm_gio_ukernel_x16c4__scalar, 16, 4, 1, 4, 1, 0) +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_packw_gemm_gio_ukernel_x32c4__scalar, 32, 4, 1, 4, 1, 0) +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_packw_gemm_gio_ukernel_x64c4__scalar, 64, 4, 1, 4, 1, 0) + +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_packw_gemm_gio_ukernel_x8c8__scalar, 8, 8, 1, 8, 1, 0) +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_packw_gemm_gio_ukernel_x16c8__scalar, 16, 8, 1, 8, 1, 0) + +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_to_qu8_packw_gemm_gio_ukernel_x8c8__scalar, 8, 8, 1, 8, 1, 128) +XNN_QS8_GIO_UKERNEL(0, xnn_qs8_to_qu8_packw_gemm_gio_ukernel_x16c8__scalar, 16, 8, 1, 8, 1, 128) + #if XNN_ENABLE_AVXVNNI && (XNN_ARCH_X86_64 || XNN_ARCH_X86) XNN_QS8_UKERNEL(xnn_arch_x86_avxvnni, xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni, 8, 8, 1, 
8, 1, 0) XNN_QS8_UKERNEL(xnn_arch_x86_avxvnni, xnn_qs8_packw_gemm_goi_ukernel_x8c8__avxvnni_prfm, 8, 8, 1, 8, 1, 0) diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-gio-scalar.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-gio-scalar.c new file mode 100644 index 00000000000..fbf305ebc3a --- /dev/null +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x16c8-gemm-gio-scalar.c @@ -0,0 +1,2231 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_to_qu8_packw_gemm_gio_ukernel_x16c8__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 16); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 128): 128); + + do { + // NC main loop multiple of 16 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 16; n -= 16) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + ((int32_t*) out)[8] = b[8]; + ((int32_t*) out)[9] = b[9]; + ((int32_t*) out)[10] = b[10]; + ((int32_t*) out)[11] = b[11]; + ((int32_t*) out)[12] = b[12]; + ((int32_t*) out)[13] = b[13]; + ((int32_t*) out)[14] = b[14]; + ((int32_t*) out)[15] = b[15]; + b += 16; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + ((int32_t*) out)[8] = 0; + ((int32_t*) out)[9] = 0; + ((int32_t*) out)[10] = 0; + ((int32_t*) out)[11] = 0; + ((int32_t*) out)[12] = 0; + ((int32_t*) out)[13] = 0; + ((int32_t*) out)[14] = 0; + ((int32_t*) out)[15] = 0; + } + out += 16 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + uint32_t ksum15 = 0; + + // KC main loop multiple of 16x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const 
int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += 
(uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += 
(uint32_t) v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + ksum7 += (uint32_t) v4x7; + ksum7 += (uint32_t) v5x7; + ksum7 += (uint32_t) v6x7; + ksum7 += (uint32_t) v7x7; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; + out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + const int8_t v4x8 = w4[8]; + const int8_t v5x8 = w5[8]; + const int8_t v6x8 = w6[8]; + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + ksum8 += (uint32_t) v4x8; + ksum8 += (uint32_t) v5x8; + ksum8 += (uint32_t) v6x8; + ksum8 += (uint32_t) v7x8; + out[64] = v0x8; + out[65] = v1x8; + out[66] = v2x8; + out[67] = v3x8; + out[68] = v4x8; + out[69] = v5x8; + out[70] = v6x8; + out[71] = v7x8; + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + const int8_t v4x9 = w4[9]; + const int8_t v5x9 = w5[9]; + const int8_t v6x9 = w6[9]; + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + ksum9 += (uint32_t) v4x9; + ksum9 += (uint32_t) v5x9; + ksum9 += (uint32_t) v6x9; + ksum9 += (uint32_t) v7x9; + out[72] = v0x9; + out[73] = v1x9; + out[74] = v2x9; + out[75] = v3x9; + out[76] = v4x9; + out[77] = v5x9; + out[78] = v6x9; + out[79] = v7x9; + const int8_t v0x10 = w0[10]; + const 
int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + const int8_t v4x10 = w4[10]; + const int8_t v5x10 = w5[10]; + const int8_t v6x10 = w6[10]; + const int8_t v7x10 = w7[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + ksum10 += (uint32_t) v4x10; + ksum10 += (uint32_t) v5x10; + ksum10 += (uint32_t) v6x10; + ksum10 += (uint32_t) v7x10; + out[80] = v0x10; + out[81] = v1x10; + out[82] = v2x10; + out[83] = v3x10; + out[84] = v4x10; + out[85] = v5x10; + out[86] = v6x10; + out[87] = v7x10; + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + const int8_t v4x11 = w4[11]; + const int8_t v5x11 = w5[11]; + const int8_t v6x11 = w6[11]; + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + ksum11 += (uint32_t) v4x11; + ksum11 += (uint32_t) v5x11; + ksum11 += (uint32_t) v6x11; + ksum11 += (uint32_t) v7x11; + out[88] = v0x11; + out[89] = v1x11; + out[90] = v2x11; + out[91] = v3x11; + out[92] = v4x11; + out[93] = v5x11; + out[94] = v6x11; + out[95] = v7x11; + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + const int8_t v4x12 = w4[12]; + const int8_t v5x12 = w5[12]; + const int8_t v6x12 = w6[12]; + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + ksum12 += (uint32_t) v4x12; + ksum12 += (uint32_t) v5x12; + ksum12 += (uint32_t) v6x12; + ksum12 += (uint32_t) v7x12; + out[96] = v0x12; + out[97] = v1x12; + out[98] = v2x12; + out[99] = v3x12; + out[100] = v4x12; + out[101] = v5x12; + out[102] = v6x12; + out[103] = v7x12; + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 
= w3[13]; + const int8_t v4x13 = w4[13]; + const int8_t v5x13 = w5[13]; + const int8_t v6x13 = w6[13]; + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + ksum13 += (uint32_t) v4x13; + ksum13 += (uint32_t) v5x13; + ksum13 += (uint32_t) v6x13; + ksum13 += (uint32_t) v7x13; + out[104] = v0x13; + out[105] = v1x13; + out[106] = v2x13; + out[107] = v3x13; + out[108] = v4x13; + out[109] = v5x13; + out[110] = v6x13; + out[111] = v7x13; + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + const int8_t v4x14 = w4[14]; + const int8_t v5x14 = w5[14]; + const int8_t v6x14 = w6[14]; + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + ksum14 += (uint32_t) v4x14; + ksum14 += (uint32_t) v5x14; + ksum14 += (uint32_t) v6x14; + ksum14 += (uint32_t) v7x14; + out[112] = v0x14; + out[113] = v1x14; + out[114] = v2x14; + out[115] = v3x14; + out[116] = v4x14; + out[117] = v5x14; + out[118] = v6x14; + out[119] = v7x14; + const int8_t v0x15 = w0[15]; + const int8_t v1x15 = w1[15]; + const int8_t v2x15 = w2[15]; + const int8_t v3x15 = w3[15]; + const int8_t v4x15 = w4[15]; + const int8_t v5x15 = w5[15]; + const int8_t v6x15 = w6[15]; + const int8_t v7x15 = w7[15]; + ksum15 += (uint32_t) v0x15; + ksum15 += (uint32_t) v1x15; + ksum15 += (uint32_t) v2x15; + ksum15 += (uint32_t) v3x15; + ksum15 += (uint32_t) v4x15; + ksum15 += (uint32_t) v5x15; + ksum15 += (uint32_t) v6x15; + ksum15 += (uint32_t) v7x15; + out[120] = v0x15; + out[121] = v1x15; + out[122] = v2x15; + out[123] = v3x15; + out[124] = v4x15; + out[125] = v5x15; + out[126] = v6x15; + out[127] = v7x15; + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * 
k_stride; + out += 128; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = 
v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; 
+ out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + ksum7 += (uint32_t) v4x7; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + ksum7 += (uint32_t) v5x7; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + ksum7 += (uint32_t) v6x7; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v7x7; + out[63] = v7x7; + } + const int8_t v0x8 = w0[8]; + ksum8 += 
(uint32_t) v0x8; + out[64] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[65] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += (uint32_t) v2x8; + out[66] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[67] = v3x8; + } + if (4 < k) { + const int8_t v4x8 = w4[8]; + ksum8 += (uint32_t) v4x8; + out[68] = v4x8; + } + if (5 < k) { + const int8_t v5x8 = w5[8]; + ksum8 += (uint32_t) v5x8; + out[69] = v5x8; + } + if (6 < k) { + const int8_t v6x8 = w6[8]; + ksum8 += (uint32_t) v6x8; + out[70] = v6x8; + } + if (7 < k) { + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v7x8; + out[71] = v7x8; + } + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[72] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[73] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[74] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[75] = v3x9; + } + if (4 < k) { + const int8_t v4x9 = w4[9]; + ksum9 += (uint32_t) v4x9; + out[76] = v4x9; + } + if (5 < k) { + const int8_t v5x9 = w5[9]; + ksum9 += (uint32_t) v5x9; + out[77] = v5x9; + } + if (6 < k) { + const int8_t v6x9 = w6[9]; + ksum9 += (uint32_t) v6x9; + out[78] = v6x9; + } + if (7 < k) { + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v7x9; + out[79] = v7x9; + } + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[80] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[81] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[82] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[83] = v3x10; + } + if (4 < k) { + const int8_t v4x10 = w4[10]; + ksum10 += (uint32_t) v4x10; + out[84] = v4x10; + } + if (5 < k) { + const int8_t v5x10 = w5[10]; + ksum10 += (uint32_t) v5x10; + out[85] = v5x10; + 
} + if (6 < k) { + const int8_t v6x10 = w6[10]; + ksum10 += (uint32_t) v6x10; + out[86] = v6x10; + } + if (7 < k) { + const int8_t v7x10 = w7[10]; + ksum10 += (uint32_t) v7x10; + out[87] = v7x10; + } + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[88] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[89] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[90] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[91] = v3x11; + } + if (4 < k) { + const int8_t v4x11 = w4[11]; + ksum11 += (uint32_t) v4x11; + out[92] = v4x11; + } + if (5 < k) { + const int8_t v5x11 = w5[11]; + ksum11 += (uint32_t) v5x11; + out[93] = v5x11; + } + if (6 < k) { + const int8_t v6x11 = w6[11]; + ksum11 += (uint32_t) v6x11; + out[94] = v6x11; + } + if (7 < k) { + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v7x11; + out[95] = v7x11; + } + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[96] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[97] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[98] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[99] = v3x12; + } + if (4 < k) { + const int8_t v4x12 = w4[12]; + ksum12 += (uint32_t) v4x12; + out[100] = v4x12; + } + if (5 < k) { + const int8_t v5x12 = w5[12]; + ksum12 += (uint32_t) v5x12; + out[101] = v5x12; + } + if (6 < k) { + const int8_t v6x12 = w6[12]; + ksum12 += (uint32_t) v6x12; + out[102] = v6x12; + } + if (7 < k) { + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v7x12; + out[103] = v7x12; + } + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[104] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[105] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) 
v2x13; + out[106] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + ksum13 += (uint32_t) v3x13; + out[107] = v3x13; + } + if (4 < k) { + const int8_t v4x13 = w4[13]; + ksum13 += (uint32_t) v4x13; + out[108] = v4x13; + } + if (5 < k) { + const int8_t v5x13 = w5[13]; + ksum13 += (uint32_t) v5x13; + out[109] = v5x13; + } + if (6 < k) { + const int8_t v6x13 = w6[13]; + ksum13 += (uint32_t) v6x13; + out[110] = v6x13; + } + if (7 < k) { + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v7x13; + out[111] = v7x13; + } + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[112] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[113] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[114] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[115] = v3x14; + } + if (4 < k) { + const int8_t v4x14 = w4[14]; + ksum14 += (uint32_t) v4x14; + out[116] = v4x14; + } + if (5 < k) { + const int8_t v5x14 = w5[14]; + ksum14 += (uint32_t) v5x14; + out[117] = v5x14; + } + if (6 < k) { + const int8_t v6x14 = w6[14]; + ksum14 += (uint32_t) v6x14; + out[118] = v6x14; + } + if (7 < k) { + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v7x14; + out[119] = v7x14; + } + const int8_t v0x15 = w0[15]; + ksum15 += (uint32_t) v0x15; + out[120] = v0x15; + if (1 < k) { + const int8_t v1x15 = w1[15]; + ksum15 += (uint32_t) v1x15; + out[121] = v1x15; + } + if (2 < k) { + const int8_t v2x15 = w2[15]; + ksum15 += (uint32_t) v2x15; + out[122] = v2x15; + } + if (3 < k) { + const int8_t v3x15 = w3[15]; + ksum15 += (uint32_t) v3x15; + out[123] = v3x15; + } + if (4 < k) { + const int8_t v4x15 = w4[15]; + ksum15 += (uint32_t) v4x15; + out[124] = v4x15; + } + if (5 < k) { + const int8_t v5x15 = w5[15]; + ksum15 += (uint32_t) v5x15; + out[125] = v5x15; + } + if (6 < k) { + const int8_t v6x15 = w6[15]; + ksum15 += (uint32_t) v6x15; + out[126] = v6x15; + } + if 
(7 < k) { + const int8_t v7x15 = w7[15]; + ksum15 += (uint32_t) v7x15; + out[127] = v7x15; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 128; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + packed_b[15] -= ksum15 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 16; + } + + // NC remainder (1..15) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (16 - n) * sizeof(int32_t); + + // NR remainder has less than 16 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + uint32_t ksum8 = 0; + uint32_t ksum9 = 0; + uint32_t ksum10 = 0; + uint32_t ksum11 = 0; + uint32_t ksum12 = 0; + uint32_t ksum13 = 0; + uint32_t ksum14 = 0; + + // KC main loop multiple of 16x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t 
v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = 
w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) 
v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + ksum7 += (uint32_t) v4x7; + ksum7 += (uint32_t) v5x7; + ksum7 += (uint32_t) v6x7; + ksum7 += (uint32_t) v7x7; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; + out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + const int8_t v1x8 = w1[8]; + const int8_t v2x8 = w2[8]; + const int8_t v3x8 = w3[8]; + const int8_t v4x8 = w4[8]; + const int8_t v5x8 = w5[8]; + const int8_t v6x8 = w6[8]; + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v0x8; + ksum8 += (uint32_t) v1x8; + ksum8 += (uint32_t) v2x8; + ksum8 += (uint32_t) v3x8; + ksum8 += (uint32_t) v4x8; + ksum8 += (uint32_t) v5x8; + ksum8 += (uint32_t) v6x8; + ksum8 += (uint32_t) v7x8; + out[64] = v0x8; + out[65] = v1x8; + out[66] = v2x8; + out[67] = v3x8; + out[68] = v4x8; + out[69] = v5x8; + out[70] = v6x8; + out[71] = v7x8; + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + const int8_t v1x9 = w1[9]; + const int8_t v2x9 = w2[9]; + const int8_t v3x9 = w3[9]; + const int8_t v4x9 = w4[9]; + const int8_t v5x9 = w5[9]; + const int8_t v6x9 = w6[9]; + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v0x9; + ksum9 += (uint32_t) v1x9; + ksum9 += (uint32_t) v2x9; + ksum9 += (uint32_t) v3x9; + ksum9 += (uint32_t) v4x9; + ksum9 += (uint32_t) v5x9; + ksum9 += (uint32_t) v6x9; + ksum9 += (uint32_t) 
v7x9; + out[72] = v0x9; + out[73] = v1x9; + out[74] = v2x9; + out[75] = v3x9; + out[76] = v4x9; + out[77] = v5x9; + out[78] = v6x9; + out[79] = v7x9; + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + const int8_t v1x10 = w1[10]; + const int8_t v2x10 = w2[10]; + const int8_t v3x10 = w3[10]; + const int8_t v4x10 = w4[10]; + const int8_t v5x10 = w5[10]; + const int8_t v6x10 = w6[10]; + const int8_t v7x10 = w7[10]; + ksum10 += (uint32_t) v0x10; + ksum10 += (uint32_t) v1x10; + ksum10 += (uint32_t) v2x10; + ksum10 += (uint32_t) v3x10; + ksum10 += (uint32_t) v4x10; + ksum10 += (uint32_t) v5x10; + ksum10 += (uint32_t) v6x10; + ksum10 += (uint32_t) v7x10; + out[80] = v0x10; + out[81] = v1x10; + out[82] = v2x10; + out[83] = v3x10; + out[84] = v4x10; + out[85] = v5x10; + out[86] = v6x10; + out[87] = v7x10; + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + const int8_t v1x11 = w1[11]; + const int8_t v2x11 = w2[11]; + const int8_t v3x11 = w3[11]; + const int8_t v4x11 = w4[11]; + const int8_t v5x11 = w5[11]; + const int8_t v6x11 = w6[11]; + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v0x11; + ksum11 += (uint32_t) v1x11; + ksum11 += (uint32_t) v2x11; + ksum11 += (uint32_t) v3x11; + ksum11 += (uint32_t) v4x11; + ksum11 += (uint32_t) v5x11; + ksum11 += (uint32_t) v6x11; + ksum11 += (uint32_t) v7x11; + out[88] = v0x11; + out[89] = v1x11; + out[90] = v2x11; + out[91] = v3x11; + out[92] = v4x11; + out[93] = v5x11; + out[94] = v6x11; + out[95] = v7x11; + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + const int8_t v1x12 = w1[12]; + const int8_t v2x12 = w2[12]; + const int8_t v3x12 = w3[12]; + const int8_t v4x12 = w4[12]; + const int8_t v5x12 = w5[12]; + const int8_t v6x12 = w6[12]; + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v0x12; + ksum12 += (uint32_t) v1x12; + ksum12 += (uint32_t) v2x12; + ksum12 += (uint32_t) v3x12; + ksum12 += (uint32_t) v4x12; + ksum12 += (uint32_t) v5x12; + ksum12 += (uint32_t) v6x12; + ksum12 += (uint32_t) v7x12; + out[96] = v0x12; + 
out[97] = v1x12; + out[98] = v2x12; + out[99] = v3x12; + out[100] = v4x12; + out[101] = v5x12; + out[102] = v6x12; + out[103] = v7x12; + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + const int8_t v1x13 = w1[13]; + const int8_t v2x13 = w2[13]; + const int8_t v3x13 = w3[13]; + const int8_t v4x13 = w4[13]; + const int8_t v5x13 = w5[13]; + const int8_t v6x13 = w6[13]; + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v0x13; + ksum13 += (uint32_t) v1x13; + ksum13 += (uint32_t) v2x13; + ksum13 += (uint32_t) v3x13; + ksum13 += (uint32_t) v4x13; + ksum13 += (uint32_t) v5x13; + ksum13 += (uint32_t) v6x13; + ksum13 += (uint32_t) v7x13; + out[104] = v0x13; + out[105] = v1x13; + out[106] = v2x13; + out[107] = v3x13; + out[108] = v4x13; + out[109] = v5x13; + out[110] = v6x13; + out[111] = v7x13; + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + const int8_t v1x14 = w1[14]; + const int8_t v2x14 = w2[14]; + const int8_t v3x14 = w3[14]; + const int8_t v4x14 = w4[14]; + const int8_t v5x14 = w5[14]; + const int8_t v6x14 = w6[14]; + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v0x14; + ksum14 += (uint32_t) v1x14; + ksum14 += (uint32_t) v2x14; + ksum14 += (uint32_t) v3x14; + ksum14 += (uint32_t) v4x14; + ksum14 += (uint32_t) v5x14; + ksum14 += (uint32_t) v6x14; + ksum14 += (uint32_t) v7x14; + out[112] = v0x14; + out[113] = v1x14; + out[114] = v2x14; + out[115] = v3x14; + out[116] = v4x14; + out[117] = v5x14; + out[118] = v6x14; + out[119] = v7x14; + } + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 128; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + 
out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + } + if (3 < 
n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = 
w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + } + if (7 < n) { + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + ksum7 += (uint32_t) v4x7; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + ksum7 += (uint32_t) v5x7; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + ksum7 += (uint32_t) v6x7; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v7x7; + out[63] = v7x7; + } + } + if (8 < n) { + const int8_t v0x8 = w0[8]; + ksum8 += (uint32_t) v0x8; + out[64] = v0x8; + if (1 < k) { + const int8_t v1x8 = w1[8]; + ksum8 += (uint32_t) v1x8; + out[65] = v1x8; + } + if (2 < k) { + const int8_t v2x8 = w2[8]; + ksum8 += 
(uint32_t) v2x8; + out[66] = v2x8; + } + if (3 < k) { + const int8_t v3x8 = w3[8]; + ksum8 += (uint32_t) v3x8; + out[67] = v3x8; + } + if (4 < k) { + const int8_t v4x8 = w4[8]; + ksum8 += (uint32_t) v4x8; + out[68] = v4x8; + } + if (5 < k) { + const int8_t v5x8 = w5[8]; + ksum8 += (uint32_t) v5x8; + out[69] = v5x8; + } + if (6 < k) { + const int8_t v6x8 = w6[8]; + ksum8 += (uint32_t) v6x8; + out[70] = v6x8; + } + if (7 < k) { + const int8_t v7x8 = w7[8]; + ksum8 += (uint32_t) v7x8; + out[71] = v7x8; + } + } + if (9 < n) { + const int8_t v0x9 = w0[9]; + ksum9 += (uint32_t) v0x9; + out[72] = v0x9; + if (1 < k) { + const int8_t v1x9 = w1[9]; + ksum9 += (uint32_t) v1x9; + out[73] = v1x9; + } + if (2 < k) { + const int8_t v2x9 = w2[9]; + ksum9 += (uint32_t) v2x9; + out[74] = v2x9; + } + if (3 < k) { + const int8_t v3x9 = w3[9]; + ksum9 += (uint32_t) v3x9; + out[75] = v3x9; + } + if (4 < k) { + const int8_t v4x9 = w4[9]; + ksum9 += (uint32_t) v4x9; + out[76] = v4x9; + } + if (5 < k) { + const int8_t v5x9 = w5[9]; + ksum9 += (uint32_t) v5x9; + out[77] = v5x9; + } + if (6 < k) { + const int8_t v6x9 = w6[9]; + ksum9 += (uint32_t) v6x9; + out[78] = v6x9; + } + if (7 < k) { + const int8_t v7x9 = w7[9]; + ksum9 += (uint32_t) v7x9; + out[79] = v7x9; + } + } + if (10 < n) { + const int8_t v0x10 = w0[10]; + ksum10 += (uint32_t) v0x10; + out[80] = v0x10; + if (1 < k) { + const int8_t v1x10 = w1[10]; + ksum10 += (uint32_t) v1x10; + out[81] = v1x10; + } + if (2 < k) { + const int8_t v2x10 = w2[10]; + ksum10 += (uint32_t) v2x10; + out[82] = v2x10; + } + if (3 < k) { + const int8_t v3x10 = w3[10]; + ksum10 += (uint32_t) v3x10; + out[83] = v3x10; + } + if (4 < k) { + const int8_t v4x10 = w4[10]; + ksum10 += (uint32_t) v4x10; + out[84] = v4x10; + } + if (5 < k) { + const int8_t v5x10 = w5[10]; + ksum10 += (uint32_t) v5x10; + out[85] = v5x10; + } + if (6 < k) { + const int8_t v6x10 = w6[10]; + ksum10 += (uint32_t) v6x10; + out[86] = v6x10; + } + if (7 < k) { + const int8_t v7x10 = 
w7[10]; + ksum10 += (uint32_t) v7x10; + out[87] = v7x10; + } + } + if (11 < n) { + const int8_t v0x11 = w0[11]; + ksum11 += (uint32_t) v0x11; + out[88] = v0x11; + if (1 < k) { + const int8_t v1x11 = w1[11]; + ksum11 += (uint32_t) v1x11; + out[89] = v1x11; + } + if (2 < k) { + const int8_t v2x11 = w2[11]; + ksum11 += (uint32_t) v2x11; + out[90] = v2x11; + } + if (3 < k) { + const int8_t v3x11 = w3[11]; + ksum11 += (uint32_t) v3x11; + out[91] = v3x11; + } + if (4 < k) { + const int8_t v4x11 = w4[11]; + ksum11 += (uint32_t) v4x11; + out[92] = v4x11; + } + if (5 < k) { + const int8_t v5x11 = w5[11]; + ksum11 += (uint32_t) v5x11; + out[93] = v5x11; + } + if (6 < k) { + const int8_t v6x11 = w6[11]; + ksum11 += (uint32_t) v6x11; + out[94] = v6x11; + } + if (7 < k) { + const int8_t v7x11 = w7[11]; + ksum11 += (uint32_t) v7x11; + out[95] = v7x11; + } + } + if (12 < n) { + const int8_t v0x12 = w0[12]; + ksum12 += (uint32_t) v0x12; + out[96] = v0x12; + if (1 < k) { + const int8_t v1x12 = w1[12]; + ksum12 += (uint32_t) v1x12; + out[97] = v1x12; + } + if (2 < k) { + const int8_t v2x12 = w2[12]; + ksum12 += (uint32_t) v2x12; + out[98] = v2x12; + } + if (3 < k) { + const int8_t v3x12 = w3[12]; + ksum12 += (uint32_t) v3x12; + out[99] = v3x12; + } + if (4 < k) { + const int8_t v4x12 = w4[12]; + ksum12 += (uint32_t) v4x12; + out[100] = v4x12; + } + if (5 < k) { + const int8_t v5x12 = w5[12]; + ksum12 += (uint32_t) v5x12; + out[101] = v5x12; + } + if (6 < k) { + const int8_t v6x12 = w6[12]; + ksum12 += (uint32_t) v6x12; + out[102] = v6x12; + } + if (7 < k) { + const int8_t v7x12 = w7[12]; + ksum12 += (uint32_t) v7x12; + out[103] = v7x12; + } + } + if (13 < n) { + const int8_t v0x13 = w0[13]; + ksum13 += (uint32_t) v0x13; + out[104] = v0x13; + if (1 < k) { + const int8_t v1x13 = w1[13]; + ksum13 += (uint32_t) v1x13; + out[105] = v1x13; + } + if (2 < k) { + const int8_t v2x13 = w2[13]; + ksum13 += (uint32_t) v2x13; + out[106] = v2x13; + } + if (3 < k) { + const int8_t v3x13 = w3[13]; + 
ksum13 += (uint32_t) v3x13; + out[107] = v3x13; + } + if (4 < k) { + const int8_t v4x13 = w4[13]; + ksum13 += (uint32_t) v4x13; + out[108] = v4x13; + } + if (5 < k) { + const int8_t v5x13 = w5[13]; + ksum13 += (uint32_t) v5x13; + out[109] = v5x13; + } + if (6 < k) { + const int8_t v6x13 = w6[13]; + ksum13 += (uint32_t) v6x13; + out[110] = v6x13; + } + if (7 < k) { + const int8_t v7x13 = w7[13]; + ksum13 += (uint32_t) v7x13; + out[111] = v7x13; + } + } + if (14 < n) { + const int8_t v0x14 = w0[14]; + ksum14 += (uint32_t) v0x14; + out[112] = v0x14; + if (1 < k) { + const int8_t v1x14 = w1[14]; + ksum14 += (uint32_t) v1x14; + out[113] = v1x14; + } + if (2 < k) { + const int8_t v2x14 = w2[14]; + ksum14 += (uint32_t) v2x14; + out[114] = v2x14; + } + if (3 < k) { + const int8_t v3x14 = w3[14]; + ksum14 += (uint32_t) v3x14; + out[115] = v3x14; + } + if (4 < k) { + const int8_t v4x14 = w4[14]; + ksum14 += (uint32_t) v4x14; + out[116] = v4x14; + } + if (5 < k) { + const int8_t v5x14 = w5[14]; + ksum14 += (uint32_t) v5x14; + out[117] = v5x14; + } + if (6 < k) { + const int8_t v6x14 = w6[14]; + ksum14 += (uint32_t) v6x14; + out[118] = v6x14; + } + if (7 < k) { + const int8_t v7x14 = w7[14]; + ksum14 += (uint32_t) v7x14; + out[119] = v7x14; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 128; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + packed_b[8] -= ksum8 * izp; + packed_b[9] -= ksum9 * izp; + packed_b[10] -= ksum10 * izp; + packed_b[11] -= ksum11 * izp; + packed_b[12] -= ksum12 * izp; + packed_b[13] -= ksum13 * izp; + packed_b[14] -= ksum14 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while 
(--g != 0); +} diff --git a/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-gio-scalar.c b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-gio-scalar.c new file mode 100644 index 00000000000..eac3667afbd --- /dev/null +++ b/src/qs8-qu8-packw/gen/qs8-qu8-packw-x8c8-gemm-gio-scalar.c @@ -0,0 +1,1159 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_qs8_to_qu8_packw_gemm_gio_ukernel_x8c8__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const int32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const int32_t* b = (const int32_t*) bias; + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + 128): 128); + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + ((int32_t*) out)[0] = b[0]; + ((int32_t*) out)[1] = b[1]; + ((int32_t*) out)[2] = b[2]; + ((int32_t*) out)[3] = b[3]; + ((int32_t*) out)[4] = b[4]; + ((int32_t*) out)[5] = b[5]; + ((int32_t*) out)[6] = b[6]; + ((int32_t*) out)[7] = b[7]; + b += 8; + } else { + ((int32_t*) out)[0] = 0; + ((int32_t*) out)[1] = 0; + ((int32_t*) out)[2] = 0; + ((int32_t*) out)[3] = 0; + ((int32_t*) out)[4] = 0; + ((int32_t*) out)[5] = 0; + ((int32_t*) out)[6] = 0; + ((int32_t*) out)[7] = 0; + } + out += 8 * sizeof(int32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + uint32_t ksum7 = 0; + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 += (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = 
w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) 
v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v0x7; + ksum7 += (uint32_t) v1x7; + ksum7 += (uint32_t) v2x7; + ksum7 += (uint32_t) v3x7; + ksum7 += (uint32_t) v4x7; + ksum7 += (uint32_t) v5x7; + ksum7 += (uint32_t) v6x7; + ksum7 += (uint32_t) v7x7; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; 
+ out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + 
const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + 
out[39] = v7x4; + } + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + const int8_t v0x7 = w0[7]; + ksum7 += (uint32_t) v0x7; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + ksum7 += (uint32_t) v1x7; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + ksum7 += (uint32_t) v2x7; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + ksum7 += (uint32_t) v3x7; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + ksum7 += (uint32_t) v4x7; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + ksum7 += 
(uint32_t) v5x7; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + ksum7 += (uint32_t) v6x7; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + ksum7 += (uint32_t) v7x7; + out[63] = v7x7; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 64; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + packed_b[7] -= ksum7 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 8; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((int32_t*) out) = *b++; + out += sizeof(int32_t); + } while (--nb != 0); + } else { + size_t nb = n; + do { + *((int32_t*) out) = 0; + out += sizeof(int32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(int32_t); + + // NR remainder has less than 8 rows so last row is not loaded + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + + uint32_t ksum0 = 0; + uint32_t ksum1 = 0; + uint32_t ksum2 = 0; + uint32_t ksum3 = 0; + uint32_t ksum4 = 0; + uint32_t ksum5 = 0; + uint32_t ksum6 = 0; + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v0x0; + ksum0 += (uint32_t) v1x0; + ksum0 += (uint32_t) v2x0; + ksum0 
+= (uint32_t) v3x0; + ksum0 += (uint32_t) v4x0; + ksum0 += (uint32_t) v5x0; + ksum0 += (uint32_t) v6x0; + ksum0 += (uint32_t) v7x0; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v0x1; + ksum1 += (uint32_t) v1x1; + ksum1 += (uint32_t) v2x1; + ksum1 += (uint32_t) v3x1; + ksum1 += (uint32_t) v4x1; + ksum1 += (uint32_t) v5x1; + ksum1 += (uint32_t) v6x1; + ksum1 += (uint32_t) v7x1; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v0x2; + ksum2 += (uint32_t) v1x2; + ksum2 += (uint32_t) v2x2; + ksum2 += (uint32_t) v3x2; + ksum2 += (uint32_t) v4x2; + ksum2 += (uint32_t) v5x2; + ksum2 += (uint32_t) v6x2; + ksum2 += (uint32_t) v7x2; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v0x3; + ksum3 += (uint32_t) v1x3; + ksum3 += (uint32_t) v2x3; + ksum3 += (uint32_t) v3x3; + ksum3 += (uint32_t) v4x3; + ksum3 += (uint32_t) v5x3; + ksum3 += (uint32_t) v6x3; + ksum3 += (uint32_t) v7x3; + out[24] = v0x3; + 
out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v0x4; + ksum4 += (uint32_t) v1x4; + ksum4 += (uint32_t) v2x4; + ksum4 += (uint32_t) v3x4; + ksum4 += (uint32_t) v4x4; + ksum4 += (uint32_t) v5x4; + ksum4 += (uint32_t) v6x4; + ksum4 += (uint32_t) v7x4; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v0x5; + ksum5 += (uint32_t) v1x5; + ksum5 += (uint32_t) v2x5; + ksum5 += (uint32_t) v3x5; + ksum5 += (uint32_t) v4x5; + ksum5 += (uint32_t) v5x5; + ksum5 += (uint32_t) v6x5; + ksum5 += (uint32_t) v7x5; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v0x6; + ksum6 += (uint32_t) v1x6; + ksum6 += (uint32_t) v2x6; + ksum6 += (uint32_t) v3x6; + ksum6 += (uint32_t) v4x6; + ksum6 += (uint32_t) v5x6; + ksum6 += (uint32_t) v6x6; + ksum6 += (uint32_t) v7x6; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + } + w0 += 8 * 
k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + if (0 < n) { + const int8_t v0x0 = w0[0]; + ksum0 += (uint32_t) v0x0; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + ksum0 += (uint32_t) v1x0; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + ksum0 += (uint32_t) v2x0; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + ksum0 += (uint32_t) v3x0; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + ksum0 += (uint32_t) v4x0; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + ksum0 += (uint32_t) v5x0; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + ksum0 += (uint32_t) v6x0; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + ksum0 += (uint32_t) v7x0; + out[7] = v7x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + ksum1 += (uint32_t) v0x1; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + ksum1 += (uint32_t) v1x1; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + ksum1 += (uint32_t) v2x1; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + ksum1 += (uint32_t) v3x1; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + ksum1 += (uint32_t) v4x1; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + ksum1 += (uint32_t) v5x1; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + ksum1 += (uint32_t) v6x1; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + ksum1 += (uint32_t) v7x1; + out[15] = v7x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + ksum2 += (uint32_t) v0x2; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + ksum2 += (uint32_t) v1x2; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + ksum2 += (uint32_t) 
v2x2; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + ksum2 += (uint32_t) v3x2; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + ksum2 += (uint32_t) v4x2; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + ksum2 += (uint32_t) v5x2; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + ksum2 += (uint32_t) v6x2; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + ksum2 += (uint32_t) v7x2; + out[23] = v7x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + ksum3 += (uint32_t) v0x3; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + ksum3 += (uint32_t) v1x3; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + ksum3 += (uint32_t) v2x3; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + ksum3 += (uint32_t) v3x3; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + ksum3 += (uint32_t) v4x3; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + ksum3 += (uint32_t) v5x3; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + ksum3 += (uint32_t) v6x3; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + ksum3 += (uint32_t) v7x3; + out[31] = v7x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + ksum4 += (uint32_t) v0x4; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + ksum4 += (uint32_t) v1x4; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + ksum4 += (uint32_t) v2x4; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + ksum4 += (uint32_t) v3x4; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + ksum4 += (uint32_t) v4x4; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + ksum4 += (uint32_t) v5x4; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + ksum4 += (uint32_t) v6x4; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + ksum4 += (uint32_t) v7x4; + out[39] = v7x4; + 
} + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + ksum5 += (uint32_t) v0x5; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + ksum5 += (uint32_t) v1x5; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + ksum5 += (uint32_t) v2x5; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + ksum5 += (uint32_t) v3x5; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + ksum5 += (uint32_t) v4x5; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + ksum5 += (uint32_t) v5x5; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + ksum5 += (uint32_t) v6x5; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + ksum5 += (uint32_t) v7x5; + out[47] = v7x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + ksum6 += (uint32_t) v0x6; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + ksum6 += (uint32_t) v1x6; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + ksum6 += (uint32_t) v2x6; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + ksum6 += (uint32_t) v3x6; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + ksum6 += (uint32_t) v4x6; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + ksum6 += (uint32_t) v5x6; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + ksum6 += (uint32_t) v6x6; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + ksum6 += (uint32_t) v7x6; + out[55] = v7x6; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 64; + } + + packed_b[0] -= ksum0 * izp; + packed_b[1] -= ksum1 * izp; + packed_b[2] -= ksum2 * izp; + packed_b[3] -= ksum3 * izp; + packed_b[4] -= ksum4 * izp; + packed_b[5] -= ksum5 * izp; + packed_b[6] -= ksum6 * izp; + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } 
while (--g != 0); +} diff --git a/src/x8-packw/gen/x8-packw-x8c8-gemm-gio-scalar.c b/src/x8-packw/gen/x8-packw-x8c8-gemm-gio-scalar.c new file mode 100644 index 00000000000..16d779a7466 --- /dev/null +++ b/src/x8-packw/gen/x8-packw-x8c8-gemm-gio-scalar.c @@ -0,0 +1,886 @@ +// Auto-generated file. Do not edit! +// Template: src/x8-packw/kr-gio-scalar.c.in +// Generator: tools/xngen +// +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + + +#include +#include +#include + +#include "xnnpack/packw.h" + +void xnn_x8_packw_gemm_gio_ukernel_x8c8__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* weights, + const uint32_t* bias, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == 8); + assert(kr == 8); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + int8_t* out = (int8_t*) packed_weights; + const uint32_t* b = (const uint32_t*) bias; + + do { + // NC main loop multiple of 8 + const int8_t* w0 = (const int8_t*) weights; + size_t n = nc; + for (;n >= 8; n -= 8) { + if XNN_LIKELY(b != NULL) { + ((uint32_t*) out)[0] = b[0]; + ((uint32_t*) out)[1] = b[1]; + ((uint32_t*) out)[2] = b[2]; + ((uint32_t*) out)[3] = b[3]; + ((uint32_t*) out)[4] = b[4]; + ((uint32_t*) out)[5] = b[5]; + ((uint32_t*) out)[6] = b[6]; + ((uint32_t*) out)[7] = b[7]; + b += 8; + } else { + ((uint32_t*) out)[0] = 0; + ((uint32_t*) out)[1] = 0; + ((uint32_t*) out)[2] = 0; + ((uint32_t*) out)[3] = 0; + ((uint32_t*) out)[4] = 0; + ((uint32_t*) out)[5] = 0; + ((uint32_t*) out)[6] = 0; + ((uint32_t*) out)[7] = 0; + } + out += 8 * sizeof(uint32_t); + + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + 
k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + out[24] = v0x3; + out[25] = v1x3; + out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + 
out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + const int8_t v0x7 = w0[7]; + const int8_t v1x7 = w1[7]; + const int8_t v2x7 = w2[7]; + const int8_t v3x7 = w3[7]; + const int8_t v4x7 = w4[7]; + const int8_t v5x7 = w5[7]; + const int8_t v6x7 = w6[7]; + const int8_t v7x7 = w7[7]; + out[56] = v0x7; + out[57] = v1x7; + out[58] = v2x7; + out[59] = v3x7; + out[60] = v4x7; + out[61] = v5x7; + out[62] = v6x7; + out[63] = v7x7; + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + const int8_t v0x0 = w0[0]; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + out[6] = v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + out[7] = v7x0; + } + const int8_t v0x1 = w0[1]; + 
out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + out[15] = v7x1; + } + const int8_t v0x2 = w0[2]; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + out[23] = v7x2; + } + const int8_t v0x3 = w0[3]; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + out[31] = v7x3; + } + const int8_t v0x4 = w0[4]; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + out[36] = v4x4; + } + if (5 < k) { + const int8_t v5x4 = w5[4]; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + out[39] = v7x4; 
+ } + const int8_t v0x5 = w0[5]; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + out[47] = v7x5; + } + const int8_t v0x6 = w0[6]; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + out[55] = v7x6; + } + const int8_t v0x7 = w0[7]; + out[56] = v0x7; + if (1 < k) { + const int8_t v1x7 = w1[7]; + out[57] = v1x7; + } + if (2 < k) { + const int8_t v2x7 = w2[7]; + out[58] = v2x7; + } + if (3 < k) { + const int8_t v3x7 = w3[7]; + out[59] = v3x7; + } + if (4 < k) { + const int8_t v4x7 = w4[7]; + out[60] = v4x7; + } + if (5 < k) { + const int8_t v5x7 = w5[7]; + out[61] = v5x7; + } + if (6 < k) { + const int8_t v6x7 = w6[7]; + out[62] = v6x7; + } + if (7 < k) { + const int8_t v7x7 = w7[7]; + out[63] = v7x7; + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 64; + } + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + 8; + } + + // NC remainder (1..7) + if XNN_UNLIKELY(n != 0) { + if XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + *((uint32_t*) out) = *b++; + out += sizeof(uint32_t); + } while (--nb != 0); 
+ } else { + size_t nb = n; + do { + *((uint32_t*) out) = 0; + out += sizeof(uint32_t); + } while (--nb != 0); + } + out += (8 - n) * sizeof(uint32_t); + + // NC remainder has fewer than 8 columns, so the last column is not loaded (the w pointers below step over K rows) + const int8_t* w1 = w0 + k_stride; + const int8_t* w2 = w1 + k_stride; + const int8_t* w3 = w2 + k_stride; + const int8_t* w4 = w3 + k_stride; + const int8_t* w5 = w4 + k_stride; + const int8_t* w6 = w5 + k_stride; + const int8_t* w7 = w6 + k_stride; + + + // KC main loop multiple of 8x8 + size_t k = kc; + for (; k >= 8; k -= 8) { + const int8_t v0x0 = w0[0]; + const int8_t v1x0 = w1[0]; + const int8_t v2x0 = w2[0]; + const int8_t v3x0 = w3[0]; + const int8_t v4x0 = w4[0]; + const int8_t v5x0 = w5[0]; + const int8_t v6x0 = w6[0]; + const int8_t v7x0 = w7[0]; + out[0] = v0x0; + out[1] = v1x0; + out[2] = v2x0; + out[3] = v3x0; + out[4] = v4x0; + out[5] = v5x0; + out[6] = v6x0; + out[7] = v7x0; + if (1 < n) { + const int8_t v0x1 = w0[1]; + const int8_t v1x1 = w1[1]; + const int8_t v2x1 = w2[1]; + const int8_t v3x1 = w3[1]; + const int8_t v4x1 = w4[1]; + const int8_t v5x1 = w5[1]; + const int8_t v6x1 = w6[1]; + const int8_t v7x1 = w7[1]; + out[8] = v0x1; + out[9] = v1x1; + out[10] = v2x1; + out[11] = v3x1; + out[12] = v4x1; + out[13] = v5x1; + out[14] = v6x1; + out[15] = v7x1; + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + const int8_t v1x2 = w1[2]; + const int8_t v2x2 = w2[2]; + const int8_t v3x2 = w3[2]; + const int8_t v4x2 = w4[2]; + const int8_t v5x2 = w5[2]; + const int8_t v6x2 = w6[2]; + const int8_t v7x2 = w7[2]; + out[16] = v0x2; + out[17] = v1x2; + out[18] = v2x2; + out[19] = v3x2; + out[20] = v4x2; + out[21] = v5x2; + out[22] = v6x2; + out[23] = v7x2; + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + const int8_t v1x3 = w1[3]; + const int8_t v2x3 = w2[3]; + const int8_t v3x3 = w3[3]; + const int8_t v4x3 = w4[3]; + const int8_t v5x3 = w5[3]; + const int8_t v6x3 = w6[3]; + const int8_t v7x3 = w7[3]; + out[24] = v0x3; + out[25] = v1x3; +
out[26] = v2x3; + out[27] = v3x3; + out[28] = v4x3; + out[29] = v5x3; + out[30] = v6x3; + out[31] = v7x3; + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + const int8_t v1x4 = w1[4]; + const int8_t v2x4 = w2[4]; + const int8_t v3x4 = w3[4]; + const int8_t v4x4 = w4[4]; + const int8_t v5x4 = w5[4]; + const int8_t v6x4 = w6[4]; + const int8_t v7x4 = w7[4]; + out[32] = v0x4; + out[33] = v1x4; + out[34] = v2x4; + out[35] = v3x4; + out[36] = v4x4; + out[37] = v5x4; + out[38] = v6x4; + out[39] = v7x4; + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + const int8_t v1x5 = w1[5]; + const int8_t v2x5 = w2[5]; + const int8_t v3x5 = w3[5]; + const int8_t v4x5 = w4[5]; + const int8_t v5x5 = w5[5]; + const int8_t v6x5 = w6[5]; + const int8_t v7x5 = w7[5]; + out[40] = v0x5; + out[41] = v1x5; + out[42] = v2x5; + out[43] = v3x5; + out[44] = v4x5; + out[45] = v5x5; + out[46] = v6x5; + out[47] = v7x5; + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + const int8_t v1x6 = w1[6]; + const int8_t v2x6 = w2[6]; + const int8_t v3x6 = w3[6]; + const int8_t v4x6 = w4[6]; + const int8_t v5x6 = w5[6]; + const int8_t v6x6 = w6[6]; + const int8_t v7x6 = w7[6]; + out[48] = v0x6; + out[49] = v1x6; + out[50] = v2x6; + out[51] = v3x6; + out[52] = v4x6; + out[53] = v5x6; + out[54] = v6x6; + out[55] = v7x6; + } + w0 += 8 * k_stride; + w1 += 8 * k_stride; + w2 += 8 * k_stride; + w3 += 8 * k_stride; + w4 += 8 * k_stride; + w5 += 8 * k_stride; + w6 += 8 * k_stride; + w7 += 8 * k_stride; + out += 64; + } + + // KC remainder of 1..7 + if (k != 0) { + assert(k >= 1 && k <= 7); + if (0 < n) { + const int8_t v0x0 = w0[0]; + out[0] = v0x0; + if (1 < k) { + const int8_t v1x0 = w1[0]; + out[1] = v1x0; + } + if (2 < k) { + const int8_t v2x0 = w2[0]; + out[2] = v2x0; + } + if (3 < k) { + const int8_t v3x0 = w3[0]; + out[3] = v3x0; + } + if (4 < k) { + const int8_t v4x0 = w4[0]; + out[4] = v4x0; + } + if (5 < k) { + const int8_t v5x0 = w5[0]; + out[5] = v5x0; + } + if (6 < k) { + const int8_t v6x0 = w6[0]; + out[6] 
= v6x0; + } + if (7 < k) { + const int8_t v7x0 = w7[0]; + out[7] = v7x0; + } + } + if (1 < n) { + const int8_t v0x1 = w0[1]; + out[8] = v0x1; + if (1 < k) { + const int8_t v1x1 = w1[1]; + out[9] = v1x1; + } + if (2 < k) { + const int8_t v2x1 = w2[1]; + out[10] = v2x1; + } + if (3 < k) { + const int8_t v3x1 = w3[1]; + out[11] = v3x1; + } + if (4 < k) { + const int8_t v4x1 = w4[1]; + out[12] = v4x1; + } + if (5 < k) { + const int8_t v5x1 = w5[1]; + out[13] = v5x1; + } + if (6 < k) { + const int8_t v6x1 = w6[1]; + out[14] = v6x1; + } + if (7 < k) { + const int8_t v7x1 = w7[1]; + out[15] = v7x1; + } + } + if (2 < n) { + const int8_t v0x2 = w0[2]; + out[16] = v0x2; + if (1 < k) { + const int8_t v1x2 = w1[2]; + out[17] = v1x2; + } + if (2 < k) { + const int8_t v2x2 = w2[2]; + out[18] = v2x2; + } + if (3 < k) { + const int8_t v3x2 = w3[2]; + out[19] = v3x2; + } + if (4 < k) { + const int8_t v4x2 = w4[2]; + out[20] = v4x2; + } + if (5 < k) { + const int8_t v5x2 = w5[2]; + out[21] = v5x2; + } + if (6 < k) { + const int8_t v6x2 = w6[2]; + out[22] = v6x2; + } + if (7 < k) { + const int8_t v7x2 = w7[2]; + out[23] = v7x2; + } + } + if (3 < n) { + const int8_t v0x3 = w0[3]; + out[24] = v0x3; + if (1 < k) { + const int8_t v1x3 = w1[3]; + out[25] = v1x3; + } + if (2 < k) { + const int8_t v2x3 = w2[3]; + out[26] = v2x3; + } + if (3 < k) { + const int8_t v3x3 = w3[3]; + out[27] = v3x3; + } + if (4 < k) { + const int8_t v4x3 = w4[3]; + out[28] = v4x3; + } + if (5 < k) { + const int8_t v5x3 = w5[3]; + out[29] = v5x3; + } + if (6 < k) { + const int8_t v6x3 = w6[3]; + out[30] = v6x3; + } + if (7 < k) { + const int8_t v7x3 = w7[3]; + out[31] = v7x3; + } + } + if (4 < n) { + const int8_t v0x4 = w0[4]; + out[32] = v0x4; + if (1 < k) { + const int8_t v1x4 = w1[4]; + out[33] = v1x4; + } + if (2 < k) { + const int8_t v2x4 = w2[4]; + out[34] = v2x4; + } + if (3 < k) { + const int8_t v3x4 = w3[4]; + out[35] = v3x4; + } + if (4 < k) { + const int8_t v4x4 = w4[4]; + out[36] = v4x4; + } + if (5 < 
k) { + const int8_t v5x4 = w5[4]; + out[37] = v5x4; + } + if (6 < k) { + const int8_t v6x4 = w6[4]; + out[38] = v6x4; + } + if (7 < k) { + const int8_t v7x4 = w7[4]; + out[39] = v7x4; + } + } + if (5 < n) { + const int8_t v0x5 = w0[5]; + out[40] = v0x5; + if (1 < k) { + const int8_t v1x5 = w1[5]; + out[41] = v1x5; + } + if (2 < k) { + const int8_t v2x5 = w2[5]; + out[42] = v2x5; + } + if (3 < k) { + const int8_t v3x5 = w3[5]; + out[43] = v3x5; + } + if (4 < k) { + const int8_t v4x5 = w4[5]; + out[44] = v4x5; + } + if (5 < k) { + const int8_t v5x5 = w5[5]; + out[45] = v5x5; + } + if (6 < k) { + const int8_t v6x5 = w6[5]; + out[46] = v6x5; + } + if (7 < k) { + const int8_t v7x5 = w7[5]; + out[47] = v7x5; + } + } + if (6 < n) { + const int8_t v0x6 = w0[6]; + out[48] = v0x6; + if (1 < k) { + const int8_t v1x6 = w1[6]; + out[49] = v1x6; + } + if (2 < k) { + const int8_t v2x6 = w2[6]; + out[50] = v2x6; + } + if (3 < k) { + const int8_t v3x6 = w3[6]; + out[51] = v3x6; + } + if (4 < k) { + const int8_t v4x6 = w4[6]; + out[52] = v4x6; + } + if (5 < k) { + const int8_t v5x6 = w5[6]; + out[53] = v5x6; + } + if (6 < k) { + const int8_t v6x6 = w6[6]; + out[54] = v6x6; + } + if (7 < k) { + const int8_t v7x6 = w7[6]; + out[55] = v7x6; + } + } + w0 += k * k_stride; + w1 += k * k_stride; + w2 += k * k_stride; + w3 += k * k_stride; + w4 += k * k_stride; + w5 += k * k_stride; + w6 += k * k_stride; + w7 += k * k_stride; + out += 64; + } + + out = (int8_t*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git a/src/x8-packw/kr-gio-scalar.c.in b/src/x8-packw/kr-gio-scalar.c.in new file mode 100644 index 00000000000..d61750f6669 --- /dev/null +++ b/src/x8-packw/kr-gio-scalar.c.in @@ -0,0 +1,214 @@ +// Copyright 2023 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +$assert NR > 1 +$assert KR > 1 +$assert TYPE in ["int8_t"] +$assert IZP in [0, 128] + +#include +#include +#include + +#include "xnnpack/packw.h" + +$BTYPE = {"QS8": "int32_t", "X8": "uint32_t"}[DATATYPE] +$WTYPE = {"int8_t": "int8_t", "uint16_t": "uint16_t", "uint32_t": "uint32_t", "float": "uint32_t"}[TYPE] +void xnn_${DATATYPE.lower()}${"_to_qu8" if IZP == 128 else ""}_packw_gemm_gio_ukernel_x${NR}c${KR}__scalar( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const ${WTYPE}* weights, + const ${BTYPE}* bias, + const void* scale, + ${WTYPE}* packed_weights, + size_t extra_bytes, + const void* params) +{ + assert(g != 0); + assert(nc != 0); + assert(kc != 0); + assert(nr == ${NR}); + assert(kr == ${KR}); + assert(sr == 1); + assert(weights != NULL); + assert(packed_weights != NULL); + + ${TYPE}* out = (${TYPE}*) packed_weights; + const ${BTYPE}* b = (const ${BTYPE}*) bias; + $if DATATYPE in ["QS8"]: + const uint32_t izp = (uint32_t) (params ? 
(((const struct xnn_qs8_packw_params*) params)->input_zero_point + ${IZP}): ${IZP}); + + do { + // NC main loop multiple of ${NR} + const ${TYPE}* w0 = (const ${TYPE}*) weights; + size_t n = nc; + for (;n >= ${NR}; n -= ${NR}) { + $if DATATYPE in ["QS8"]: + int32_t* packed_b = (int32_t*) out; + if XNN_LIKELY(b != NULL) { + $for N in range(NR): + $if BTYPE == TYPE: + out[${N}] = b[${N}]; + $else: + ((${BTYPE}*) out)[${N}] = b[${N}]; + b += ${NR}; + } else { + $for N in range(NR): + $if BTYPE == TYPE: + out[${N}] = 0; + $else: + ((${BTYPE}*) out)[${N}] = 0; + } + $if BTYPE == TYPE: + out += ${NR}; + $else: + out += ${NR} * sizeof(${BTYPE}); + + $for K in range(1, KR): + const ${TYPE}* w${K} = w${K-1} + k_stride; + $if DATATYPE in ["QS8"]: + $for N in range(NR): + uint32_t ksum${N} = 0; + + // KC main loop multiple of ${NR}x${KR} + size_t k = kc; + for (; k >= ${KR}; k -= ${KR}) { + $for N in range(NR): + $for K in range(KR): + const ${TYPE} v${K}x${N} = w${K}[${N}]; + $for K in range(KR): + $if DATATYPE in ["QS8"]: + ksum${N} += (uint32_t) v${K}x${N}; + $for K in range(KR): + out[${N*KR+K}] = v${K}x${N}; + $for K in range(KR): + w${K} += ${KR} * k_stride; + out += ${NR*KR}; + } + + // KC remainder of 1..${KR-1} + if (k != 0) { + assert(k >= 1 && k <= ${KR-1}); + $for N in range(NR): + const ${TYPE} v0x${N} = w0[${N}]; + $if DATATYPE in ["QS8"]: + ksum${N} += (uint32_t) v0x${N}; + out[${N*KR}] = v0x${N}; + $for K in range(1, KR): + if (${K} < k) { + const ${TYPE} v${K}x${N} = w${K}[${N}]; + $if DATATYPE in ["QS8"]: + ksum${N} += (uint32_t) v${K}x${N}; + out[${N*KR+K}] = v${K}x${N}; + } + $for K in range(KR): + w${K} += k * k_stride; + out += ${NR*KR}; + } + + $if DATATYPE in ["QS8"]: + $for N in range(NR): + packed_b[${N}] -= ksum${N} * izp; + out = (${TYPE}*) ((uintptr_t) out + extra_bytes); + w0 = w0 - kc * k_stride + ${NR}; + } + + // NC remainder (1..${NR-1}) + if XNN_UNLIKELY(n != 0) { + $if DATATYPE in ["QS8"]: + int32_t* packed_b = (int32_t*) out; + if 
XNN_LIKELY(b != NULL) { + size_t nb = n; + do { + $if BTYPE == TYPE: + *out++ = *b++; + $else: + *((${BTYPE}*) out) = *b++; + out += sizeof(${BTYPE}); + } while (--nb != 0); + } else { + size_t nb = n; + do { + $if BTYPE == TYPE: + *out++ = 0; + $else: + *((${BTYPE}*) out) = 0; + out += sizeof(${BTYPE}); + } while (--nb != 0); + } + $if BTYPE == TYPE: + out += (${NR} - n); + $else: + out += (${NR} - n) * sizeof(${BTYPE}); + + $if NR > 2: + // NC remainder has fewer than ${NR} columns, so the last column is not loaded (the w pointers below step over K rows) + $for K in range(1, KR): + const ${TYPE}* w${K} = w${K-1} + k_stride; + + $if DATATYPE in ["QS8"]: + $for N in range(NR-1): + uint32_t ksum${N} = 0; + + // KC main loop multiple of ${NR}x${KR} + size_t k = kc; + for (; k >= ${KR}; k -= ${KR}) { + $for K in range(KR): + const ${TYPE} v${K}x0 = w${K}[0]; + $for K in range(KR): + $if DATATYPE in ["QS8"]: + ksum0 += (uint32_t) v${K}x0; + $for K in range(KR): + out[${K}] = v${K}x0; + $for N in range(1, NR-1): + if (${N} < n) { + $for K in range(KR): + const ${TYPE} v${K}x${N} = w${K}[${N}]; + $for K in range(KR): + $if DATATYPE in ["QS8"]: + ksum${N} += (uint32_t) v${K}x${N}; + $for K in range(KR): + out[${N*KR+K}] = v${K}x${N}; + } + $for K in range(KR): + w${K} += ${KR} * k_stride; + out += ${NR*KR}; + } + + // KC remainder of 1..${KR-1} + if (k != 0) { + assert(k >= 1 && k <= ${KR-1}); + $for N in range(NR-1): + if (${N} < n) { + const ${TYPE} v0x${N} = w0[${N}]; + $if DATATYPE in ["QS8"]: + ksum${N} += (uint32_t) v0x${N}; + out[${N*KR}] = v0x${N}; + $for K in range(1, KR): + if (${K} < k) { + const ${TYPE} v${K}x${N} = w${K}[${N}]; + $if DATATYPE in ["QS8"]: + ksum${N} += (uint32_t) v${K}x${N}; + out[${N*KR+K}] = v${K}x${N}; + } + } + $for K in range(KR): + w${K} += k * k_stride; + out += ${NR*KR}; + } + + $if DATATYPE in ["QS8"]: + $for N in range(NR-1): + packed_b[${N}] -= ksum${N} * izp; + out = (${TYPE}*) ((uintptr_t) out + extra_bytes); + } + weights += nc * kc; + } while (--g != 0); +} diff --git
a/src/x8-packw/x8-packw.h b/src/x8-packw/x8-packw.h index b13f6c181e6..69ab7b55193 100644 --- a/src/x8-packw/x8-packw.h +++ b/src/x8-packw/x8-packw.h @@ -28,6 +28,8 @@ XNN_UKERNEL(0, xnn_x8_packw_gemm_goi_ukernel_x8__scalar_u4, 8, 1, 1, 4, 1) XNN_UKERNEL(0, xnn_x8_packw_gemm_goi_ukernel_x16__scalar_u4, 16, 1, 1, 4, 1) XNN_UKERNEL(0, xnn_x8_packw_gemm_goi_ukernel_x32__scalar_u4, 32, 1, 1, 4, 1) +XNN_GIO_UKERNEL(0, xnn_x8_packw_gemm_gio_ukernel_x8c8__scalar, 8, 8, 1, 8, 1) + #if XNN_ARCH_X86_64 || XNN_ARCH_X86 XNN_UKERNEL(xnn_arch_x86_avx2, xnn_x8_packw_gemm_goi_ukernel_x8c8__avx2, 8, 8, 1, 8, 1) XNN_UKERNEL(xnn_arch_x86_avx2, xnn_x8_packw_gemm_goi_ukernel_x8c8__avx2_prfm, 8, 8, 1, 8, 1) diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index 4d7bff9e679..0299239a286 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -1318,6 +1318,21 @@ typedef void (*xnn_x8_packw_gemm_goi_ukernel_fn)( size_t extra_bytes, const void* params); +typedef void (*xnn_x8_packw_gemm_gio_ukernel_fn)( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* k, + const uint32_t* b, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params); + typedef void (*xnn_qs8_packw_gemm_goi_ukernel_fn)( size_t g, size_t nc, @@ -1332,6 +1347,21 @@ typedef void (*xnn_qs8_packw_gemm_goi_ukernel_fn)( size_t extra_bytes, const void* params); +typedef void (*xnn_qs8_packw_gemm_gio_ukernel_fn)( + size_t g, + size_t nc, + size_t kc, + size_t nr, + size_t kr, + size_t sr, + size_t k_stride, + const int8_t* k, + const int32_t* b, + const void* scale, + int8_t* packed_weights, + size_t extra_bytes, + const void* params); + typedef void (*xnn_x16_packw_gemm_goi_ukernel_fn)( size_t g, size_t nc, diff --git a/src/xnnpack/packw.h b/src/xnnpack/packw.h index 1b77fb48c2c..f93ea083223 100644 --- a/src/xnnpack/packw.h +++ b/src/xnnpack/packw.h @@ -31,9 +31,26 @@ extern "C" { size_t extra_bytes, \ const 
void* params); +#define XNN_GIO_UKERNEL(arch_flags, ukernel, nr_, kr_, sr_, kblock, nr_scale) \ + XNN_INTERNAL void ukernel( \ + size_t g, \ + size_t nc, \ + size_t kc, \ + size_t nr, \ + size_t kr, \ + size_t sr, \ + size_t k_stride, \ + const int8_t* weights, \ + const uint32_t* bias, \ + const void* scale, \ + int8_t* packed_weights, \ + size_t extra_bytes, \ + const void* params); + #include "x8-packw/x8-packw.h" #undef XNN_UKERNEL +#undef XNN_GIO_UKERNEL #define XNN_QS8_UKERNEL(arch_flags, ukernel, nr_, kr_, sr_, kblock, nr_scale, izp) \ XNN_INTERNAL void ukernel( \ @@ -50,9 +67,26 @@ extern "C" { size_t extra_bytes, \ const void* params); +#define XNN_QS8_GIO_UKERNEL(arch_flags, ukernel, nr_, kr_, sr_, kblock, nr_scale, izp) \ + XNN_INTERNAL void ukernel( \ + size_t g, \ + size_t nc, \ + size_t kc, \ + size_t nr, \ + size_t kr, \ + size_t sr, \ + size_t k_stride, \ + const int8_t* weights, \ + const int32_t* bias, \ + const void* scale, \ + int8_t* packed_weights, \ + size_t extra_bytes, \ + const void* params); + #include "qs8-packw/qs8-packw.h" #undef XNN_QS8_UKERNEL +#undef XNN_QS8_GIO_UKERNEL #define XNN_UKERNEL(arch_flags, ukernel, nr_, kr_, sr_, kblock, nr_scale) \ XNN_INTERNAL void ukernel( \ diff --git a/test/packw-microkernel-tester.h b/test/packw-microkernel-tester.h index 63e57499965..ecd84df9877 100644 --- a/test/packw-microkernel-tester.h +++ b/test/packw-microkernel-tester.h @@ -154,6 +154,57 @@ class PackWMicrokernelTester { } } + void Test(xnn_qs8_packw_gemm_gio_ukernel_fn packw) const { + xnnpack::Buffer weights(XNN_EXTRA_BYTES / sizeof(int8_t) + n() * k()); + xnnpack::Buffer bias(n()); + xnnpack::Buffer packed_w( + packed_n() * packed_k() + packed_n() * sizeof(uint32_t)); + xnnpack::Buffer packed_w_ref( + packed_n() * packed_k() + packed_n() * sizeof(uint32_t)); + + std::iota(weights.begin(), weights.end(), 0); + std::iota(bias.begin(), bias.end(), UINT32_C(0)); + std::fill(packed_w.begin(), packed_w.end(), INT8_C(0)); + 
std::fill(packed_w_ref.begin(), packed_w_ref.end(), INT8_C(0x7B)); + + const int32_t* bias_data = nullbias() ? nullptr : bias.data(); + const xnn_qs8_packing_params packing_params = { 0 }; + + // Compute reference results. + auto* pack_function = izp() == 128 ? xnn_pack_qs8_to_qu8_gemm_gio_w : xnn_pack_qs8_gemm_gio_w; + pack_function(/*g=*/1, n(), k(), nr(), kr(), sr(), n(), + reinterpret_cast(weights.data()), + bias_data, + /*scale=*/nullptr, + reinterpret_cast(packed_w_ref.data()), + /*extra_bytes=*/0, &packing_params); + + // Call optimized micro-kernel. + packw(/*g=*/1, n(), k(), nr(), kr(), sr(), n(), + weights.data(), bias_data, /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, &packing_params); + + // Verify bias results. + for (size_t i = 0; i < packed_n() * sizeof(int32_t); i++) { + if (packed_w_ref[i] != INT8_C(0x7B)) { // Allow pad to differ + EXPECT_EQ((int32_t) packed_w[i], (int32_t) packed_w_ref[i]); + } + } + + // Verify weights results. + // NOTE remainder KC is different so k() is used instead of packed_k() for loop + for (size_t ki = 0; ki < k(); ki++) { + for (size_t ni = 0; ni < (n()); ni++) { + const size_t i = packed_n() * sizeof(int32_t) + ki * packed_n() + ni; + if (packed_w_ref[i] != INT8_C(0x7B)) { // Allow pad to differ + EXPECT_EQ((int32_t) packed_w[i], (int32_t) packed_w_ref[i]) + << "kr " << kr() << " of kc " << k() << " packed_k " << packed_k() << "\n" + << "nr " << nr() << " of nc " << n() << " packed_n " << packed_n() << "\n" + << "at n " << i << " of " << (int32_t) (packed_n() * packed_k() + packed_n() * sizeof(int32_t)); + } + } + } + } + void Test(xnn_x8_packw_gemm_goi_ukernel_fn packw) const { xnnpack::Buffer weights(XNN_EXTRA_BYTES / sizeof(int8_t) + n() * k()); xnnpack::Buffer bias(n()); @@ -190,6 +241,42 @@ class PackWMicrokernelTester { } } + void Test(xnn_x8_packw_gemm_gio_ukernel_fn packw) const { + xnnpack::Buffer weights(XNN_EXTRA_BYTES / sizeof(int8_t) + n() * k()); + xnnpack::Buffer bias(n()); + xnnpack::Buffer 
packed_w( + packed_n() * packed_k() + packed_n() * sizeof(uint32_t)); + xnnpack::Buffer packed_w_ref( + packed_n() * packed_k() + packed_n() * sizeof(uint32_t)); + std::iota(weights.begin(), weights.end(), 0); + std::iota(bias.begin(), bias.end(), UINT32_C(0)); + std::fill(packed_w.begin(), packed_w.end(), INT8_C(0x12)); + std::fill(packed_w_ref.begin(), packed_w_ref.end(), INT8_C(0x7B)); + + const uint32_t* bias_data = nullbias() ? nullptr : bias.data(); + const xnn_qs8_packing_params packing_params = { 127 }; + + // Compute reference results. + xnn_pack_f32_qs8w_gemm_gio_w(/*g=*/1, n(), k(), nr(), kr(), sr(), n(), + reinterpret_cast(weights.data()), + reinterpret_cast(bias_data), + /*scale=*/nullptr, + reinterpret_cast(packed_w_ref.data()), + /*extra_bytes=*/0, &packing_params); + + // Call optimized micro-kernel. + packw(/*g=*/1, n(), k(), nr(), kr(), sr(), n(), + weights.data(), bias_data, /*scale=*/nullptr, packed_w.data(), /*extra_bytes=*/0, &packing_params); + + // Verify results. 
+ for (size_t i = 0; i < (packed_n() * k() + packed_n() * sizeof(int32_t)); i++) { + if (packed_w_ref[i] != INT8_C(0x7B)) { // Allow pad to differ + EXPECT_EQ((int32_t) packed_w[i], (int32_t) packed_w_ref[i]) + << "at n " << i << " of " << (int32_t) (packed_n() * k() + packed_n()); + } + } + } + void Test(xnn_x16_packw_gemm_goi_ukernel_fn packw) const { xnnpack::Buffer weights(XNN_EXTRA_BYTES / sizeof(xnn_float16) + g() * n() * k()); xnnpack::Buffer padded_weights(g() * n() * packed_k()); diff --git a/test/qs8-packw.cc b/test/qs8-packw.cc index 23a635403c7..51c5f600bdc 100644 --- a/test/qs8-packw.cc +++ b/test/qs8-packw.cc @@ -32,11 +32,14 @@ std::string GetTestQS8Name(const testing::TestParamInfo& #define XNN_QS8_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) \ { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale, izp }, +#define XNN_QS8_GIO_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) + const XnnTestQS8Param xnn_test_qs8_params[] = { #include "qs8-packw/qs8-packw.h" }; #undef XNN_QS8_UKERNEL +#undef XNN_QS8_GIO_UKERNEL } // namespace @@ -190,3 +193,191 @@ INSTANTIATE_TEST_SUITE_P(qs8_packw, XnnTestQS8, testing::ValuesIn(xnn_test_qs8_params), GetTestQS8Name); + +// Enable on all platforms when scalar available +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + + +namespace { + + +struct XnnTestQS8GIOParam { + const char *name; + xnn_qs8_packw_gemm_gio_ukernel_fn ukernel; + uint64_t arch_flags; + size_t nr, kr, sr, kblock, nr_scale, izp; +}; + +class XnnTestQS8GIO : public testing::TestWithParam { +}; + +std::string GetTestQS8GIOName(const testing::TestParamInfo& info) { + return info.param.name; +} + +#define XNN_QS8_GIO_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) \ + { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale, izp }, + +#define XNN_QS8_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale, izp) + +const XnnTestQS8GIOParam xnn_test_qs8_gio_params[] = { +#include "qs8-packw/qs8-packw.h" +}; 
+ +#undef XNN_QS8_UKERNEL +#undef XNN_QS8_GIO_UKERNEL + +} + +TEST_P(XnnTestQS8GIO, null_bias) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + PackWMicrokernelTester() + .nullbias(true) + .n(GetParam().nr * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); +} + +TEST_P(XnnTestQS8GIO, k_eq_kblock) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); +} + +TEST_P(XnnTestQS8GIO, k_div_kblock) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t k = GetParam().kblock; k < GetParam().kblock * 5; k += GetParam().kblock) { + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(k) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestQS8GIO, k_lt_kblock) { + if (GetParam().kblock <= 1) { + GTEST_SKIP(); + } + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t k = 1; k < GetParam().kblock; k++) { + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(k) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestQS8GIO, k_gt_kblock) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t k = GetParam().kblock + 1; k < GetParam().kblock * 5; k = xnnpack::NextPrime(k + 1)) { + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(k) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestQS8GIO, n_eq_1) { + if 
(GetParam().nr <= 1 || GetParam().nr_scale != 1) { + GTEST_SKIP(); + } + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + PackWMicrokernelTester() + .n(1 * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); +} + + +TEST_P(XnnTestQS8GIO, n_div_nr_null_bias) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = GetParam().nr; n < GetParam().nr * 5; n += GetParam().nr) { + PackWMicrokernelTester() + .nullbias(true) + .n(n * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestQS8GIO, n_div_nr) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = GetParam().nr; n < GetParam().nr * 5; n += GetParam().nr) { + PackWMicrokernelTester() + .n(n * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestQS8GIO, n_lt_nr) { + if (GetParam().nr <= 1 || GetParam().nr_scale != 1) { + GTEST_SKIP(); + } + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = 1; n < GetParam().nr * GetParam().nr_scale; n++) { + PackWMicrokernelTester() + .n(n) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestQS8GIO, n_gt_nr) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = GetParam().nr * GetParam().nr_scale; n < GetParam().nr * GetParam().nr_scale * 5; n = xnnpack::NextPrime(n + 1)) { + PackWMicrokernelTester() + .n(n) + .k(xnnpack::NextPrime(GetParam().kblock + 1)) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + 
.izp(GetParam().izp) + .Test(GetParam().ukernel); + } +} + +INSTANTIATE_TEST_SUITE_P(qs8_gio_packw, + XnnTestQS8GIO, + testing::ValuesIn(xnn_test_qs8_gio_params), + GetTestQS8GIOName); + +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 diff --git a/test/x8-packw.cc b/test/x8-packw.cc index 12694864f60..5ce96633f03 100644 --- a/test/x8-packw.cc +++ b/test/x8-packw.cc @@ -32,11 +32,14 @@ std::string GetTestName(const testing::TestParamInfo& info) #define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale }, +#define XNN_GIO_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) + const XnnTestParam xnn_test_params[] = { #include "x8-packw/x8-packw.h" }; #undef XNN_UKERNEL +#undef XNN_GIO_UKERNEL } // namespace @@ -164,3 +167,163 @@ INSTANTIATE_TEST_SUITE_P(x8_packw, XnnTest, testing::ValuesIn(xnn_test_params), GetTestName); + +// TODO: a scalar GIO kernel (arch_flags 0) is registered in x8-packw.h, so this x86-only guard can presumably be dropped; confirm and enable on all platforms +#if XNN_ARCH_X86 || XNN_ARCH_X86_64 + +namespace { + +struct XnnTestGIOParam { + const char *name; + xnn_x8_packw_gemm_gio_ukernel_fn ukernel; + uint64_t arch_flags; + size_t nr, kr, sr, kblock, nr_scale; +}; + +class XnnTestGIO : public testing::TestWithParam { +}; + +std::string GetTestGIOName(const testing::TestParamInfo& info) { + return info.param.name; +} + +#define XNN_GIO_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) \ + { #ukernel, ukernel, arch_flags, nr, kr, sr, kblock, nr_scale}, + +#define XNN_UKERNEL(arch_flags, ukernel, nr, kr, sr, kblock, nr_scale) + +const XnnTestGIOParam xnn_test_gio_params[] = { +#include "x8-packw/x8-packw.h" +}; + +#undef XNN_UKERNEL +#undef XNN_GIO_UKERNEL + +} // namespace + +TEST_P(XnnTestGIO, null_bias) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + PackWMicrokernelTester() + .nullbias(true) + .n(GetParam().nr * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) +
.Test(GetParam().ukernel); +} + +TEST_P(XnnTestGIO, k_eq_kblock) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); +} + +TEST_P(XnnTestGIO, k_div_kblock) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t k = GetParam().kblock; k < GetParam().kblock * 5; k += GetParam().kblock) { + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(k) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestGIO, k_lt_kblock) { + if (GetParam().kblock <= 1) { + GTEST_SKIP(); + } + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t k = 1; k < GetParam().kblock; k++) { + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(k) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestGIO, k_gt_kblock) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t k = GetParam().kblock + 1; k < GetParam().kblock * 5; k = xnnpack::NextPrime(k + 1)) { + PackWMicrokernelTester() + .n(GetParam().nr * GetParam().nr_scale) + .k(k) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestGIO, n_eq_1) { + if (GetParam().nr <= 1 || GetParam().nr_scale != 1) { + GTEST_SKIP(); + } + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + PackWMicrokernelTester() + .n(1 * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); +} + +TEST_P(XnnTestGIO, n_div_nr) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = GetParam().nr; n < GetParam().nr * 5; n += GetParam().nr) 
{ + PackWMicrokernelTester() + .n(n * GetParam().nr_scale) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestGIO, n_lt_nr) { + if (GetParam().nr <= 1 || GetParam().nr_scale != 1) { + GTEST_SKIP(); + } + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = 1; n < GetParam().nr * GetParam().nr_scale; n++) { + PackWMicrokernelTester() + .n(n) + .k(GetParam().kblock) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); + } +} + +TEST_P(XnnTestGIO, n_gt_nr) { + TEST_REQUIRES_ARCH_FLAGS(GetParam().arch_flags); + for (size_t n = GetParam().nr * GetParam().nr_scale; n < GetParam().nr * GetParam().nr_scale * 5; n = xnnpack::NextPrime(n + 1)) { + PackWMicrokernelTester() + .n(n) + .k(xnnpack::NextPrime(GetParam().kblock + 1)) + .nr(GetParam().nr * GetParam().nr_scale) + .kr(GetParam().kr) + .sr(GetParam().sr) + .Test(GetParam().ukernel); + } +} +INSTANTIATE_TEST_SUITE_P(x8_packw_gio, + XnnTestGIO, + testing::ValuesIn(xnn_test_gio_params), + GetTestGIOName); + +#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 \ No newline at end of file