Skip to content

Commit

Permalink
[WIP] Packw Benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
mcr229 committed Dec 9, 2024
1 parent 3f03824 commit 452b90d
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1947,6 +1947,7 @@ IF(XNNPACK_BUILD_BENCHMARKS)
qu8-gemm
qu8-gemm-fp32
qu8-gemm-rndnu
qb4-packw
x16-packw
x32-packw
x8-lut
Expand Down
67 changes: 67 additions & 0 deletions bench/packw-benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,73 @@ static void x8_gio_packw(benchmark::State& state,
benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

// Benchmarks a QB4 GOI weight-packing microkernel.
//
// Source weights are sized at two K elements per byte (rounded_k >> 1) and K
// is rounded up to a multiple of the block length `bl`; one bf16 scale is
// supplied per (output channel, K block).
//
// Parameters:
//   state      - benchmark state; range(2) is read as N (nc), range(3) as K (kc).
//   packw      - packing microkernel under test.
//   nr, kr, sr - tile parameters forwarded unchanged to the microkernel.
//   bl         - block length along K.
//   isa_check  - optional predicate; when it returns false the benchmark
//                returns without running.
//
// The inputs and outputs are replicated `num_buffers` times and rotated each
// iteration so that the combined footprint exceeds the largest cache.
static void qb4_packw(benchmark::State& state,
                      xnn_qb4_packw_gemm_goi_ukernel_fn packw,
                      size_t nr, size_t kr, size_t sr, size_t bl,
                      benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check != nullptr && !isa_check(state)) {
    return;
  }

  const size_t batch = 1;  // batch is g parameter for packw
  const size_t dim_n = state.range(2);  // dim_n is nc parameter
  const size_t dim_k = state.range(3);  // dim_k is kc parameter

  const size_t rounded_n = benchmark::utils::RoundUp(dim_n, nr);
  const size_t rounded_k = benchmark::utils::RoundUp(dim_k, bl);
  const size_t num_blocks = rounded_k / bl;

  // Per-buffer extents, in elements of the respective buffer type. Every
  // allocation below and every offset inside the loop is derived from these,
  // so the stride used for indexing can never exceed what was allocated.
  const size_t weights_stride = batch * dim_n * (rounded_k >> 1);
  const size_t bias_stride = batch * dim_n;
  const size_t scales_stride = batch * dim_n * num_blocks;
  // Packed output, in bytes: nibble-packed weights, one 32-bit value per
  // output channel, one 16-bit scale per (channel, block), plus room for the
  // per-N extra bytes.
  // NOTE(review): sized as an upper bound on the assumed packed layout -
  // confirm against the packing routine's actual layout.
  const size_t packed_stride = batch *
      (rounded_n * (rounded_k >> 1) +
       rounded_n * sizeof(uint32_t) +
       rounded_n * num_blocks * sizeof(uint16_t) +
       rounded_n * sizeof(float));

  std::random_device random_device;
  auto rng = std::mt19937(random_device());

  // Compute num_buffers so that the source weights, bias, scales and
  // packed weights of all buffers together exceed the maximum cache size.
  const size_t bytes_per_buffer =
      weights_stride * sizeof(uint8_t) +
      bias_stride * sizeof(int32_t) +
      scales_stride * sizeof(xnn_bfloat16) +
      packed_stride * sizeof(int8_t);
  const size_t num_buffers = 1 +
      benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
                                              bytes_per_buffer);

  xnnpack::Buffer<uint8_t, XNN_ALLOCATION_ALIGNMENT> weights(num_buffers * weights_stride);
  xnnpack::fill_uniform_random_bits(weights.data(), weights.size(), rng);
  xnnpack::Buffer<int8_t, XNN_ALLOCATION_ALIGNMENT> packed_weights(num_buffers * packed_stride);
  xnnpack::Buffer<int32_t, XNN_ALLOCATION_ALIGNMENT> bias(num_buffers * bias_stride);
  xnnpack::fill_uniform_random_bits(bias.data(), bias.size(), rng);
  // One set of block scales per rotating buffer, matching the other inputs.
  xnnpack::Buffer<xnn_bfloat16, XNN_ALLOCATION_ALIGNMENT> bf16_scales(num_buffers * scales_stride);
  xnnpack::fill_uniform_random_bits(bf16_scales.data(), bf16_scales.size(), rng);

  const xnn_qs8_qc4w_packing_params packing_params = { 1, 8 };

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Rotate through the buffers so consecutive iterations touch cold memory.
    if (++buffer_index == num_buffers) {
      buffer_index = 0;
    }

    // NOTE(review): extra_bytes_n is passed as sizeof(float) exactly as
    // before; confirm whether the kernel expects it scaled by nr.
    packw(batch, dim_n, rounded_k, nr, kr, sr, bl,
          weights.data() + buffer_index * weights_stride,
          /*bias=*/bias.data() + buffer_index * bias_stride,
          /*scale=*/bf16_scales.data() + buffer_index * scales_stride,
          packed_weights.data() + buffer_index * packed_stride,
          /*extra_bytes_bl=*/sizeof(uint16_t) * nr, sizeof(float), &packing_params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // The reported counters keep their original definitions.
  // NOTE(review): the packed term of "bytes" counts rounded_n * rounded_k,
  // i.e. one byte per 4-bit weight - confirm this is the intended convention.
  const size_t elements_per_iteration = batch * dim_n * (rounded_k >> 1);
  state.counters["elements"] =
      benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = (elements_per_iteration + batch * (rounded_n * rounded_k + rounded_n * sizeof(uint32_t)));
  state.counters["bytes"] =
      benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}


static void qs8_packw(benchmark::State& state,
xnn_qs8_packw_gemm_goi_ukernel_fn packw,
size_t nr, size_t kr, size_t sr,
Expand Down
32 changes: 32 additions & 0 deletions bench/qb4-packw.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <benchmark/benchmark.h>
#include "bgemm.h"
#include "packw-benchmark.h"
#include "utils.h"
#include "xnnpack/common.h"
#include "xnnpack/hardware-config.h"
#include "xnnpack/packw.h"

// Per-microkernel benchmark entry point. It calls
// benchmark::utils::CheckArchFlags with `arch_flags` (the result is not
// consulted here) and then hands off to the shared qb4_packw driver with no
// ISA check callback. `net` is accepted but not used.
static void qb4_packw(benchmark::State& state,
                      const char* net,
                      xnn_qb4_packw_gemm_goi_ukernel_fn ukernel,
                      uint64_t arch_flags,
                      size_t nr, size_t kr, size_t sr, size_t bl) {
  static_cast<void>(net);  // unused
  benchmark::utils::CheckArchFlags(state, arch_flags);
  qb4_packw(state, ukernel, nr, kr, sr, bl, /*isa_check=*/nullptr);
}

// Registers one benchmark per microkernel entry: each XNN_QB4_UKERNEL(...)
// invocation expands to a BENCHMARK_CAPTURE_BGEMM of qb4_packw, named
// `<ukernel>_`, forwarding ukernel, arch_flags, nr, kr, sr and bl.
// The kblock, nr_scale and izp arguments are accepted but not used here.
#define XNN_QB4_UKERNEL(arch_flags, ukernel, nr, kr, sr, bl, kblock, nr_scale, izp) \
BENCHMARK_CAPTURE_BGEMM(qb4_packw, ukernel##_, ukernel, arch_flags, nr, kr, sr, bl);

// Presumably a list of XNN_QB4_UKERNEL(...) invocations, one per kernel -
// TODO confirm against the included header.
#include "qb4-packw/qb4-packw.h"

// The macro is only needed while the list above is being expanded.
#undef XNN_QB4_UKERNEL


// Emit BENCHMARK_MAIN() unless XNNPACK_BENCHMARK_NO_MAIN is defined.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
1 change: 0 additions & 1 deletion bench/qs8-packw.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,3 @@ BENCHMARK_CAPTURE_BGEMM(qs8_gio_packw, ukernel##_, ukernel, arch_flags, nr, kr,
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif

0 comments on commit 452b90d

Please sign in to comment.