From 452b90ddacb2580c969e33159a05a3c63fcdaa4b Mon Sep 17 00:00:00 2001
From: Max Ren
Date: Tue, 26 Nov 2024 11:47:58 -0500
Subject: [PATCH] [WIP] Packw Benchmarks

---
 CMakeLists.txt          |  1 +
 bench/packw-benchmark.h | 85 +++++++++++++++++++++++++++++++++++++++++
 bench/qb4-packw.cc      | 34 ++++++++++++++++
 bench/qs8-packw.cc      |  1 -
 4 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 bench/qb4-packw.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18a8d92ebaa..882f851c772 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1947,6 +1947,7 @@ IF(XNNPACK_BUILD_BENCHMARKS)
       qu8-gemm
       qu8-gemm-fp32
       qu8-gemm-rndnu
+      qb4-packw
       x16-packw
       x32-packw
       x8-lut
diff --git a/bench/packw-benchmark.h b/bench/packw-benchmark.h
index fe11340878a..39b61a892d3 100644
--- a/bench/packw-benchmark.h
+++ b/bench/packw-benchmark.h
@@ -136,6 +136,91 @@ static void x8_gio_packw(benchmark::State& state,
     benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
 }
 
+// Benchmarks a qb4 GOI weight-packing microkernel.
+// state.range(2) and state.range(3) supply N and K; K is rounded up to a
+// multiple of the block length `bl` before packing.
+static void qb4_packw(benchmark::State& state,
+  xnn_qb4_packw_gemm_goi_ukernel_fn packw,
+  size_t nr, size_t kr, size_t sr, size_t bl,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (isa_check != nullptr && !isa_check(state)) {
+    return;
+  }
+
+  const size_t batch = 1;  // batch is g parameter for packw
+  const size_t dim_n = state.range(2);  // dim_n is nc parameter
+  const size_t dim_k = state.range(3);  // dim_k is kc parameter
+
+  const size_t rounded_n = benchmark::utils::RoundUp(dim_n, nr);
+  const size_t rounded_k = benchmark::utils::RoundUp(dim_k, bl);
+  const size_t num_blocks = rounded_k / bl;
+
+  // Per-buffer extents, in elements of the respective buffer. Each allocation
+  // and each pointer offset in the timed loop uses the same value, so buffer
+  // i always lies inside its allocation.
+  const size_t weights_stride = batch * dim_n * (rounded_k >> 1);
+  const size_t bias_stride = batch * dim_n;
+  const size_t scales_stride = batch * dim_n * num_blocks;
+  // NOTE(review): sized as the 4-bit weights + one uint32_t per output channel
+  // + the extra_bytes_bl tail per block + extra_bytes_n per output channel,
+  // mirroring the packw arguments below; confirm against the real layout.
+  const size_t packed_stride = batch *
+    (rounded_n * (rounded_k >> 1) + rounded_n * sizeof(uint32_t) +
+     num_blocks * rounded_n * sizeof(uint16_t) + rounded_n * sizeof(float));
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+
+  // Compute num_buffers that fit cache with source weights + packed_weights.
+  // The footprint uses the same per-buffer extents as the allocations below.
+  const size_t num_buffers = 1 +
+    benchmark::utils::DivideRoundUp(benchmark::utils::GetMaxCacheSize(),
+      weights_stride + packed_stride);
+
+  // NOTE(review): the element-type template arguments of the xnnpack::Buffer
+  // declarations are not visible in this copy of the patch; restore them to
+  // match the packw signature before applying.
+  xnnpack::Buffer weights(num_buffers * weights_stride);
+  xnnpack::fill_uniform_random_bits(weights.data(), weights.size(), rng);
+  xnnpack::Buffer packed_weights(num_buffers * packed_stride);
+  xnnpack::Buffer bias(num_buffers * bias_stride);
+  xnnpack::fill_uniform_random_bits(bias.data(), bias.size(), rng);
+  xnnpack::Buffer bf16_scales(num_buffers * scales_stride);
+  xnnpack::fill_uniform_random_bits(bf16_scales.data(), bf16_scales.size(), rng);
+
+  const xnn_qs8_qc4w_packing_params packing_params = { 1, 8 };
+
+  size_t buffer_index = 0;
+  for (auto _ : state) {
+    if (++buffer_index == num_buffers) {
+      buffer_index = 0;
+    }
+
+    packw(batch, dim_n, rounded_k, nr, kr, sr, bl,
+      weights.data() + buffer_index * weights_stride,
+      /*bias=*/bias.data() + buffer_index * bias_stride,
+      /*scale=*/bf16_scales.data() + buffer_index * scales_stride,
+      packed_weights.data() + buffer_index * packed_stride,
+      /*extra_bytes_bl=*/sizeof(uint16_t) * nr, sizeof(float), &packing_params);
+  }
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+
+  const size_t elements_per_iteration = batch * dim_n * (rounded_k >> 1);
+  state.counters["elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  // Source weights read plus packed output written, per iteration.
+  const size_t bytes_per_iteration = weights_stride + packed_stride;
+  state.counters["bytes"] =
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+}
+
+
 static void qs8_packw(benchmark::State& state,
   xnn_qs8_packw_gemm_goi_ukernel_fn packw,
   size_t nr, size_t kr, size_t sr,
diff --git a/bench/qb4-packw.cc b/bench/qb4-packw.cc
new file mode 100644
index 00000000000..b378b331fe1
--- /dev/null
+++ b/bench/qb4-packw.cc
@@ -0,0 +1,34 @@
+// Copyright 2024 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <benchmark/benchmark.h>
+#include "bgemm.h"
+#include "packw-benchmark.h"
+#include "utils.h"
+#include "xnnpack/common.h"
+#include "xnnpack/hardware-config.h"
+#include "xnnpack/packw.h"
+
+// Entry point registered by XNN_QB4_UKERNEL below: checks arch_flags, then
+// runs the shared qb4_packw benchmark. `net` is unused here.
+static void qb4_packw(benchmark::State& state, const char* net,
+                      xnn_qb4_packw_gemm_goi_ukernel_fn ukernel,
+                      uint64_t arch_flags, size_t nr, size_t kr, size_t sr, size_t bl) {
+  benchmark::utils::CheckArchFlags(state, arch_flags);
+  qb4_packw(state, ukernel, nr, kr, sr, bl);
+}
+
+#define XNN_QB4_UKERNEL(arch_flags, ukernel, nr, kr, sr, bl, kblock, nr_scale, izp) \
+BENCHMARK_CAPTURE_BGEMM(qb4_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr, bl);
+
+#include "qb4-packw/qb4-packw.h"
+
+#undef XNN_QB4_UKERNEL
+
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
diff --git a/bench/qs8-packw.cc b/bench/qs8-packw.cc
index e4d8ee97cbb..a0d1e860efa 100644
--- a/bench/qs8-packw.cc
+++ b/bench/qs8-packw.cc
@@ -40,4 +40,3 @@ BENCHMARK_CAPTURE_BGEMM(qs8_gio_packw, ukernel##_, ukernel, arch_flags, nr, kr,
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
 BENCHMARK_MAIN();
 #endif
-