diff --git a/BUILD b/BUILD index 70daf7f3ce..898982abe6 100644 --- a/BUILD +++ b/BUILD @@ -244,6 +244,28 @@ cc_library( deps = [":hwy"], ) +cc_library( + name = "bit_set", + hdrs = ["hwy/bit_set.h"], + compatible_with = [], + copts = COPTS, + deps = [ + ":hwy", # HWY_ASSERT + ], +) + +cc_library( + name = "perf_counters", + srcs = ["hwy/perf_counters.cc"], + hdrs = ["hwy/perf_counters.h"], + compatible_with = [], + copts = COPTS, + deps = [ + ":bit_set", + ":hwy", + ], +) + cc_library( name = "profiler", hdrs = [ @@ -307,16 +329,6 @@ cc_library( ], ) -cc_library( - name = "bit_set", - hdrs = ["hwy/contrib/thread_pool/bit_set.h"], - compatible_with = [], - copts = COPTS, - deps = [ - ":hwy", # HWY_ASSERT - ], -) - cc_library( name = "topology", srcs = ["hwy/contrib/thread_pool/topology.cc"], @@ -471,18 +483,19 @@ HWY_TESTS = [ ("hwy/contrib/math/", "math_test"), ("hwy/contrib/random/", "random_test"), ("hwy/contrib/matvec/", "matvec_test"), - ("hwy/contrib/thread_pool/", "bit_set_test"), ("hwy/contrib/thread_pool/", "thread_pool_test"), ("hwy/contrib/thread_pool/", "topology_test"), ("hwy/contrib/unroller/", "unroller_test"), # contrib/sort has its own BUILD, we also add sort_test to GUITAR_TESTS. # To run bench_sort, specify --test=hwy/contrib/sort:bench_sort. 
("hwy/examples/", "skeleton_test"), - ("hwy/", "nanobenchmark_test"), ("hwy/", "abort_test"), ("hwy/", "aligned_allocator_test"), ("hwy/", "base_test"), + ("hwy/", "bit_set_test"), ("hwy/", "highway_test"), + ("hwy/", "nanobenchmark_test"), + ("hwy/", "perf_counters_test"), ("hwy/", "targets_test"), ("hwy/tests/", "arithmetic_test"), ("hwy/tests/", "bit_permute_test"), @@ -545,13 +558,14 @@ HWY_TEST_DEPS = [ ":bit_pack", ":bit_set", ":dot", - ":hwy", ":hwy_test_util", + ":hwy", ":image", ":math", - ":random", ":matvec", ":nanobenchmark", + ":perf_counters", + ":random", ":skeleton", ":thread_pool", ":topology", diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d4da15266..e51020c0f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,6 +202,7 @@ if (NOT HWY_CMAKE_HEADER_ONLY) hwy/aligned_allocator.cc hwy/nanobenchmark.cc hwy/per_target.cc + hwy/perf_counters.cc hwy/print.cc hwy/targets.cc hwy/timer.cc @@ -663,8 +664,10 @@ set(HWY_TEST_FILES hwy/abort_test.cc hwy/aligned_allocator_test.cc hwy/base_test.cc + hwy/bit_set_test.cc hwy/highway_test.cc hwy/nanobenchmark_test.cc + hwy/perf_counters_test.cc hwy/targets_test.cc hwy/examples/skeleton_test.cc hwy/tests/arithmetic_test.cc @@ -729,7 +732,6 @@ list(APPEND HWY_TEST_FILES hwy/contrib/random/random_test.cc hwy/contrib/sort/sort_test.cc hwy/contrib/sort/bench_sort.cc - hwy/contrib/thread_pool/bit_set_test.cc hwy/contrib/thread_pool/thread_pool_test.cc hwy/contrib/thread_pool/topology_test.cc hwy/contrib/unroller/unroller_test.cc diff --git a/hwy/contrib/thread_pool/bit_set.h b/hwy/bit_set.h similarity index 96% rename from hwy/contrib/thread_pool/bit_set.h rename to hwy/bit_set.h index 16b0c40381..f8f921becf 100644 --- a/hwy/contrib/thread_pool/bit_set.h +++ b/hwy/bit_set.h @@ -13,8 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_BIT_SET_H_ -#define HIGHWAY_HWY_CONTRIB_THREAD_POOL_BIT_SET_H_ +#ifndef HIGHWAY_HWY_BIT_SET_H_ +#define HIGHWAY_HWY_BIT_SET_H_ // BitSet with fast Foreach for up to 64 and 4096 members. @@ -155,4 +155,4 @@ class BitSet4096 { } // namespace hwy -#endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_BIT_SET_H_ +#endif // HIGHWAY_HWY_BIT_SET_H_ diff --git a/hwy/contrib/thread_pool/bit_set_test.cc b/hwy/bit_set_test.cc similarity index 100% rename from hwy/contrib/thread_pool/bit_set_test.cc rename to hwy/bit_set_test.cc diff --git a/hwy/contrib/thread_pool/topology.h b/hwy/contrib/thread_pool/topology.h index 35a725cedf..95b0835bda 100644 --- a/hwy/contrib/thread_pool/topology.h +++ b/hwy/contrib/thread_pool/topology.h @@ -23,7 +23,7 @@ #include #include "hwy/base.h" -#include "hwy/contrib/thread_pool/bit_set.h" +#include "hwy/bit_set.h" namespace hwy { diff --git a/hwy/perf_counters.cc b/hwy/perf_counters.cc new file mode 100644 index 0000000000..058e8a274e --- /dev/null +++ b/hwy/perf_counters.cc @@ -0,0 +1,234 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "hwy/perf_counters.h"
+
+#include "hwy/detect_compiler_arch.h"  // HWY_OS_LINUX
+
+#if (HWY_OS_LINUX && HWY_CXX_LANG >= 201402L) || HWY_IDE
+#include <errno.h>
+#include <fcntl.h>  // open
+#include <linux/perf_event.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>  // strcmp
+#include <sys/ioctl.h>
+#include <sys/stat.h>  // O_RDONLY
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "hwy/base.h"  // HWY_ASSERT
+#include "hwy/bit_set.h"
+
+#endif  // HWY_OS_LINUX ..
+
+namespace hwy {
+namespace platform {
+
+#if HWY_OS_LINUX && HWY_CXX_LANG >= 201402L
+
+struct CounterConfig {  // type/config pair for perf_event_open
+  uint64_t config;
+  uint32_t type;
+};
+
+CounterConfig FindCounterConfig(const char* name) {  // aborts if unknown
+  const auto eq = [name](const char* literal) {
+    return !strcmp(name, literal);
+  };
+
+  constexpr uint32_t kHW = PERF_TYPE_HARDWARE;
+  if (eq("ref_cycle")) return {PERF_COUNT_HW_REF_CPU_CYCLES, kHW};
+  if (eq("instruction")) return {PERF_COUNT_HW_INSTRUCTIONS, kHW};
+  if (eq("branch")) return {PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHW};
+  if (eq("branch_mispred")) return {PERF_COUNT_HW_BRANCH_MISSES, kHW};
+  if (eq("frontend_stall")) return {PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, kHW};
+  if (eq("backend_stall")) return {PERF_COUNT_HW_STALLED_CYCLES_BACKEND, kHW};
+
+  constexpr uint64_t kL3 = PERF_COUNT_HW_CACHE_LL;  // id | op<<8 | result<<16
+  constexpr uint64_t kLoad = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8;
+  constexpr uint64_t kStore = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8;
+  constexpr uint64_t kAcc = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS} << 16;
+  constexpr uint64_t kMiss = uint64_t{PERF_COUNT_HW_CACHE_RESULT_MISS} << 16;
+  if (eq("l3_load")) return {kL3 | kLoad | kAcc, PERF_TYPE_HW_CACHE};
+  if (eq("l3_store")) return {kL3 | kStore | kAcc, PERF_TYPE_HW_CACHE};
+  if (eq("l3_load_miss")) return {kL3 | kLoad | kMiss, PERF_TYPE_HW_CACHE};
+  if (eq("l3_store_miss")) return {kL3 | kStore | kMiss, PERF_TYPE_HW_CACHE};
+
+  if (eq("page_fault")) return {PERF_COUNT_SW_PAGE_FAULTS, PERF_TYPE_SOFTWARE};
+
+  HWY_ABORT("Bug: name %s does not match any known counter", name);
+}
+
+class PMU::Impl {  // owns one perf_event fd per counter that could be opened
+  static bool PerfCountersSupported() {
+    // This is the documented way.
+    struct stat s;
+    return stat("/proc/sys/kernel/perf_event_paranoid", &s) == 0;
+  }
+
+  static perf_event_attr MakeAttr(const CounterConfig& cc) {
+    perf_event_attr attr = {};
+    attr.type = cc.type;
+    attr.size = sizeof(attr);
+    attr.config = cc.config;
+    // We request more counters than the HW may support. If so, they are
+    // multiplexed and only active for a fraction of the runtime. Recording the
+    // times lets us extrapolate. Avoid GROUP because we want per-counter times.
+    attr.read_format =
+        PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+    attr.inherit = 1;
+    attr.exclude_kernel = 1;  // required if perf_event_paranoid == 1
+    attr.exclude_hv = 1;      // = hypervisor
+    return attr;
+  }
+
+  static int SysPerfEventOpen(const CounterConfig& cc, int group_fd) {
+    perf_event_attr attr = MakeAttr(cc);
+    // Only disable the group leader (group_fd == -1); others are gated on it.
+    if (group_fd == -1) {
+      attr.disabled = 1;
+    }
+
+    const int pid = 0;   // current process
+    const int cpu = -1;  // any CPU
+    return static_cast<int>(syscall(__NR_perf_event_open, &attr, pid, cpu,
+                                    group_fd, /*flags=*/0));
+  }
+
+  struct Buf {  // filled by read(); fields follow read_format in MakeAttr
+    uint64_t value;
+    uint64_t time_enabled;
+    uint64_t time_running;
+  };
+
+ public:
+  Impl() {  // if unsupported, fds_ stays empty and Start() returns false
+    if (!PerfCountersSupported()) {
+      fprintf(stderr,
+              "This Linux does not support perf counters. The program will "
+              "continue, but counters will return zero.\n");
+      return;
+    }
+
+    // Use groups so that all counters are enabled at the same time.
+    int group_fd = -1;
+
+    fds_.reserve(PerfCounters::Num());
+    size_t idx_counter = 0;  // for valid_
+
+    PerfCounters counters;  // unused
+    counters.ForEach(       // requires C++14 lambda template
+        &counters, [&](auto& /*val*/, auto& /*val2*/, const char* name) {
+          const CounterConfig config = FindCounterConfig(name);
+          const int fd = SysPerfEventOpen(config, group_fd);
+          if (fd < 0) {
+            fprintf(stderr, "perf_event_open error %d for counter %s\n", errno,
+                    name);
+          } else {
+            // Set event count to zero to make overflow less likely.
+            ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+
+            if (group_fd == -1) group_fd = fd;
+
+            valid_.Set(idx_counter);
+            fds_.push_back(fd);
+          }
+
+          if (idx_counter == 0) {
+            // Ensure the first counter is a HW event, because later adding a HW
+            // event to a group with only SW events is slow.
+            HWY_ASSERT(config.type == PERF_TYPE_HARDWARE);
+          }
+          ++idx_counter;
+        });
+
+    HWY_ASSERT(fds_.size() == valid_.Count());
+  }
+
+  ~Impl() {
+    for (int fd : fds_) {
+      HWY_ASSERT(fd >= 0);
+      HWY_ASSERT(close(fd) == 0);
+    }
+  }
+
+  bool Start() {
+    if (fds_.empty()) return false;  // ctor failed
+    // Enabling the first fd (group leader) enables all.
+    HWY_ASSERT(ioctl(fds_[0], PERF_EVENT_IOC_ENABLE, 0) == 0);
+    return true;
+  }
+
+  double Stop(PerfCounters& counters) {  // returns min running/enabled ratio
+    if (fds_.empty()) return 0.0;  // ctor failed
+
+    // First stop all so that we measure over the same time interval.
+    ioctl(fds_[0], PERF_EVENT_IOC_DISABLE, 0);
+
+    double min_fraction = 1.0;
+    // Visits in the same order they were initialized.
+    size_t idx_counter = 0;
+    size_t idx_fd = 0;
+    counters.ForEach(
+        &counters, [&](auto& val, auto& /*val2*/, const char* name) {
+          using T = hwy::RemoveRef<decltype(val)>;
+          val = T{0};
+          Buf buf;
+          if (valid_.Get(idx_counter)) {
+            const int fd = fds_[idx_fd++];
+          AGAIN:  // re-entered only when read() itself failed with EAGAIN
+            const ssize_t bytes_read = read(fd, &buf, sizeof(buf));
+            if (bytes_read < static_cast<ssize_t>(sizeof(buf))) {
+              if (bytes_read < 0 && errno == EAGAIN) goto AGAIN;
+              fprintf(stderr, "perf_counters read() error %d for %s\n", errno,
+                      name);
+            } else {
+              HWY_ASSERT(buf.time_running <= buf.time_enabled);
+              if (buf.time_running != 0) {
+                const double fraction =
+                    static_cast<double>(buf.time_running) / buf.time_enabled;
+                HWY_ASSERT(0.0 < fraction && fraction <= 1.0);
+                min_fraction = HWY_MIN(min_fraction, fraction);
+                val = static_cast<T>(static_cast<double>(buf.value) / fraction);
+              }
+            }
+          }
+          ++idx_counter;
+        });
+    return min_fraction;
+  }
+
+ private:
+  BitSet64 valid_;        // which counters are available
+  std::vector<int> fds_;  // size == valid_.Count()
+};
+
+PMU::PMU() : impl_(new Impl) {}
+PMU::~PMU() = default;
+bool PMU::Start() { return impl_->Start(); }
+double PMU::Stop(PerfCounters& counters) { return impl_->Stop(counters); }
+#else
+PMU::PMU() {}
+PMU::~PMU() = default;
+bool PMU::Start() { return false; }
+double PMU::Stop(PerfCounters& /*counters*/) { return 0.0; }
+#endif  // HWY_OS_LINUX ..
+
+}  // namespace platform
+}  // namespace hwy
diff --git a/hwy/perf_counters.h b/hwy/perf_counters.h
new file mode 100644
index 0000000000..b9d3d024d5
--- /dev/null
+++ b/hwy/perf_counters.h
@@ -0,0 +1,89 @@
+// Copyright 2024 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_PERF_COUNTERS_H_
+#define HIGHWAY_HWY_PERF_COUNTERS_H_
+
+// Reads OS/CPU performance counters.
+
+#include <stddef.h>
+
+#include <memory>  // unique_ptr
+
+namespace hwy {
+namespace platform {
+
+#pragma pack(push, 1)  // no padding: 2 doubles + 9 floats = 52 bytes
+class PerfCounters {
+ public:
+  static constexpr size_t Num() { return 11; }  // = number of ForEach visits
+
+  template <class Visitor>  // called as visitor(mine, other's, "member name")
+  void ForEach(PerfCounters* other, const Visitor& visitor) {
+    visitor(this->ref_cycle, other->ref_cycle, "ref_cycle");
+    visitor(this->instruction, other->instruction, "instruction");
+    visitor(this->branch, other->branch, "branch");
+    visitor(this->branch_mispred, other->branch_mispred, "branch_mispred");
+    visitor(this->frontend_stall, other->frontend_stall, "frontend_stall");
+    visitor(this->backend_stall, other->backend_stall, "backend_stall");
+    visitor(this->l3_load, other->l3_load, "l3_load");
+    visitor(this->l3_store, other->l3_store, "l3_store");
+    visitor(this->l3_load_miss, other->l3_load_miss, "l3_load_miss");
+    visitor(this->l3_store_miss, other->l3_store_miss, "l3_store_miss");
+    // SW event, keep last: PMU::Impl (perf_counters.cc) requires HW first.
+    visitor(this->page_fault, other->page_fault, "page_fault");
+  }
+
+  // Floating-point because these are extrapolated (multiplexing). We want this
+  // to fit in one cache line to reduce cost in profiler.h, hence use individual
+  // members with smaller types instead of an array. Ensure all values are sums,
+  // not ratios, so that profiler.h can add/subtract them.
+  double ref_cycle;
+  double instruction;
+  float branch;
+  float branch_mispred;
+  float frontend_stall;  // [cycles]
+  float backend_stall;   // [cycles]
+  float l3_load;
+  float l3_store;
+  float l3_load_miss;
+  float l3_store_miss;
+  float page_fault;
+};
+#pragma pack(pop)
+
+// Holds state required for reading PerfCounters. Expensive to create.
+class PMU {
+ public:
+  PMU();
+  ~PMU();
+
+  // Returns false if counters are unavailable, otherwise starts them.
+  bool Start();
+
+  // Returns 0.0 on error; otherwise the minimum coverage of any counter, i.e.,
+  // the fraction of the time between Start and Stop that the counter was
+  // active, and overwrites `counters` with the extrapolated values since Start.
+  double Stop(PerfCounters& counters);
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace platform
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_PERF_COUNTERS_H_
diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc
new file mode 100644
index 0000000000..01e34ab398
--- /dev/null
+++ b/hwy/perf_counters_test.cc
@@ -0,0 +1,67 @@
+// Copyright 2024 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/perf_counters.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "hwy/nanobenchmark.h"
+#include "hwy/tests/hwy_gtest.h"
+#include "hwy/tests/test_util-inl.h"
+
+namespace hwy {
+namespace {
+
+TEST(PerfCountersTest, RunTest) {
+  RandomState rng;
+  platform::PMU pmu;
+  if (pmu.Start()) {  // nothing to check if counters are unavailable
+    const size_t iters = (hwy::Unpredictable1() * 1000) + (rng() & 1);
+    uint64_t r = rng();
+    fprintf(stderr, "r: %llu\n", static_cast<unsigned long long>(r));
+    for (size_t i = 0; i < iters; ++i) {
+      if (PopCount(rng()) < 36) {
+        r -= rng() & 0xF;
+      } else {
+        // Entirely different operation to ensure there is a branch.
+        r >>= 1;
+      }
+    }
+
+    platform::PerfCounters counters;
+    const double min_coverage = pmu.Stop(counters);
+    fprintf(stderr, "r: %d, coverage %f\n", static_cast<int>(r), min_coverage);
+    if (min_coverage != 0.0) {  // Stop() returns 0.0 on error
+#if HWY_CXX_LANG >= 201402L
+      counters.ForEach(&counters,
+                       [](auto& val, auto& /*val2*/, const char* name) {
+                         fprintf(stderr, "%-20s: %.3E\n", name, val);
+                       });
+#endif
+
+      HWY_ASSERT(counters.ref_cycle > 1000);
+      HWY_ASSERT(counters.instruction > 1000);
+      HWY_ASSERT(counters.branch > 1000);
+      HWY_ASSERT(counters.branch_mispred > 200);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace hwy
+
+HWY_TEST_MAIN();