Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move bit_set to core #2205

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,28 @@ cc_library(
deps = [":hwy"],
)

# Header-only target (no srcs) exposing hwy/bit_set.h.
cc_library(
name = "bit_set",
hdrs = ["hwy/bit_set.h"],
compatible_with = [],
copts = COPTS,
deps = [
":hwy", # HWY_ASSERT
],
)

# Performance-counter library: hwy/perf_counters.h plus its implementation in
# hwy/perf_counters.cc. Depends on :bit_set and :hwy.
cc_library(
name = "perf_counters",
srcs = ["hwy/perf_counters.cc"],
hdrs = ["hwy/perf_counters.h"],
compatible_with = [],
copts = COPTS,
deps = [
":bit_set",
":hwy",
],
)

cc_library(
name = "profiler",
hdrs = [
Expand Down Expand Up @@ -307,16 +329,6 @@ cc_library(
],
)

cc_library(
name = "bit_set",
hdrs = ["hwy/contrib/thread_pool/bit_set.h"],
compatible_with = [],
copts = COPTS,
deps = [
":hwy", # HWY_ASSERT
],
)

cc_library(
name = "topology",
srcs = ["hwy/contrib/thread_pool/topology.cc"],
Expand Down Expand Up @@ -471,18 +483,19 @@ HWY_TESTS = [
("hwy/contrib/math/", "math_test"),
("hwy/contrib/random/", "random_test"),
("hwy/contrib/matvec/", "matvec_test"),
("hwy/contrib/thread_pool/", "bit_set_test"),
("hwy/contrib/thread_pool/", "thread_pool_test"),
("hwy/contrib/thread_pool/", "topology_test"),
("hwy/contrib/unroller/", "unroller_test"),
# contrib/sort has its own BUILD, we also add sort_test to GUITAR_TESTS.
# To run bench_sort, specify --test=hwy/contrib/sort:bench_sort.
("hwy/examples/", "skeleton_test"),
("hwy/", "nanobenchmark_test"),
("hwy/", "abort_test"),
("hwy/", "aligned_allocator_test"),
("hwy/", "base_test"),
("hwy/", "bit_set_test"),
("hwy/", "highway_test"),
("hwy/", "nanobenchmark_test"),
("hwy/", "perf_counters_test"),
("hwy/", "targets_test"),
("hwy/tests/", "arithmetic_test"),
("hwy/tests/", "bit_permute_test"),
Expand Down Expand Up @@ -545,13 +558,14 @@ HWY_TEST_DEPS = [
":bit_pack",
":bit_set",
":dot",
":hwy",
":hwy_test_util",
":hwy",
":image",
":math",
":random",
":matvec",
":nanobenchmark",
":perf_counters",
":random",
":skeleton",
":thread_pool",
":topology",
Expand Down
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ if (NOT HWY_CMAKE_HEADER_ONLY)
hwy/aligned_allocator.cc
hwy/nanobenchmark.cc
hwy/per_target.cc
hwy/perf_counters.cc
hwy/print.cc
hwy/targets.cc
hwy/timer.cc
Expand Down Expand Up @@ -663,8 +664,10 @@ set(HWY_TEST_FILES
hwy/abort_test.cc
hwy/aligned_allocator_test.cc
hwy/base_test.cc
hwy/bit_set_test.cc
hwy/highway_test.cc
hwy/nanobenchmark_test.cc
hwy/perf_counters_test.cc
hwy/targets_test.cc
hwy/examples/skeleton_test.cc
hwy/tests/arithmetic_test.cc
Expand Down Expand Up @@ -729,7 +732,6 @@ list(APPEND HWY_TEST_FILES
hwy/contrib/random/random_test.cc
hwy/contrib/sort/sort_test.cc
hwy/contrib/sort/bench_sort.cc
hwy/contrib/thread_pool/bit_set_test.cc
hwy/contrib/thread_pool/thread_pool_test.cc
hwy/contrib/thread_pool/topology_test.cc
hwy/contrib/unroller/unroller_test.cc
Expand Down
6 changes: 3 additions & 3 deletions hwy/contrib/thread_pool/bit_set.h → hwy/bit_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_CONTRIB_THREAD_POOL_BIT_SET_H_
#define HIGHWAY_HWY_CONTRIB_THREAD_POOL_BIT_SET_H_
#ifndef HIGHWAY_HWY_BIT_SET_H_
#define HIGHWAY_HWY_BIT_SET_H_

// BitSet with fast Foreach for up to 64 and 4096 members.

Expand Down Expand Up @@ -155,4 +155,4 @@ class BitSet4096 {

} // namespace hwy

#endif // HIGHWAY_HWY_CONTRIB_THREAD_POOL_BIT_SET_H_
#endif // HIGHWAY_HWY_BIT_SET_H_
File renamed without changes.
2 changes: 1 addition & 1 deletion hwy/contrib/thread_pool/topology.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include <vector>

#include "hwy/base.h"
#include "hwy/contrib/thread_pool/bit_set.h"
#include "hwy/bit_set.h"

namespace hwy {

Expand Down
234 changes: 234 additions & 0 deletions hwy/perf_counters.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/perf_counters.h"

#include "hwy/detect_compiler_arch.h" // HWY_OS_LINUX

#if (HWY_OS_LINUX && HWY_CXX_LANG >= 201402L) || HWY_IDE
#include <errno.h>
#include <fcntl.h> // open
#include <linux/perf_event.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h> // strcmp
#include <sys/ioctl.h>
#include <sys/stat.h> // stat
#include <sys/syscall.h>
#include <unistd.h>

#include <vector>

#include "hwy/base.h" // HWY_ASSERT
#include "hwy/bit_set.h"

#endif // HWY_OS_LINUX ..

namespace hwy {
namespace platform {

#if HWY_OS_LINUX && HWY_CXX_LANG >= 201402L

// Identifies one event for perf_event_open. Brace-initialized as
// {config, type} by FindCounterConfig, so the field order must not change.
struct CounterConfig { // for perf_event_open
// Event selector; copied into perf_event_attr::config by MakeAttr.
uint64_t config;
// Event class (PERF_TYPE_*); copied into perf_event_attr::type by MakeAttr.
uint32_t type;
};

// Translates a counter `name` into the {config, type} pair handed to
// perf_event_open. `name` must be one of the strings in the table below;
// anything else aborts the program.
CounterConfig FindCounterConfig(const char* name) {
  constexpr uint32_t kHardware = PERF_TYPE_HARDWARE;
  constexpr uint32_t kCache = PERF_TYPE_HW_CACHE;
  constexpr uint32_t kSoftware = PERF_TYPE_SOFTWARE;

  // Cache events are composed as: cache id | (operation << 8) | (result << 16).
  constexpr uint64_t kLastLevel = PERF_COUNT_HW_CACHE_LL;
  constexpr uint64_t kRead = uint64_t{PERF_COUNT_HW_CACHE_OP_READ} << 8;
  constexpr uint64_t kWrite = uint64_t{PERF_COUNT_HW_CACHE_OP_WRITE} << 8;
  constexpr uint64_t kAccess = uint64_t{PERF_COUNT_HW_CACHE_RESULT_ACCESS}
                               << 16;
  constexpr uint64_t kMissed = uint64_t{PERF_COUNT_HW_CACHE_RESULT_MISS} << 16;

  struct Entry {
    const char* name;
    CounterConfig cc;
  };
  static constexpr Entry kKnown[] = {
      {"ref_cycle", {PERF_COUNT_HW_REF_CPU_CYCLES, kHardware}},
      {"instruction", {PERF_COUNT_HW_INSTRUCTIONS, kHardware}},
      {"branch", {PERF_COUNT_HW_BRANCH_INSTRUCTIONS, kHardware}},
      {"branch_mispred", {PERF_COUNT_HW_BRANCH_MISSES, kHardware}},
      {"frontend_stall", {PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, kHardware}},
      {"backend_stall", {PERF_COUNT_HW_STALLED_CYCLES_BACKEND, kHardware}},
      {"l3_load", {kLastLevel | kRead | kAccess, kCache}},
      {"l3_store", {kLastLevel | kWrite | kAccess, kCache}},
      {"l3_load_miss", {kLastLevel | kRead | kMissed, kCache}},
      {"l3_store_miss", {kLastLevel | kWrite | kMissed, kCache}},
      {"page_fault", {PERF_COUNT_SW_PAGE_FAULTS, kSoftware}},
  };

  for (const Entry& entry : kKnown) {
    if (strcmp(name, entry.name) == 0) return entry.cc;
  }

  HWY_ABORT("Bug: name %s does not match any known counter", name);
}

// Linux implementation of PMU. The constructor opens one perf_event file
// descriptor per counter visited by PerfCounters::ForEach, all in one group
// led by the first counter that could be opened. Start() enables the group,
// Stop() disables it and reads every counter back.
class PMU::Impl {
  // Returns whether the kernel exposes the perf_event interface.
  static bool PerfCountersSupported() {
    // This is the documented way.
    struct stat s;
    return stat("/proc/sys/kernel/perf_event_paranoid", &s) == 0;
  }

  // Builds the perf_event_attr shared by all counters; only type and config
  // differ between them.
  static perf_event_attr MakeAttr(const CounterConfig& cc) {
    perf_event_attr attr = {};
    attr.type = cc.type;
    attr.size = sizeof(attr);
    attr.config = cc.config;
    // We request more counters than the HW may support. If so, they are
    // multiplexed and only active for a fraction of the runtime. Recording the
    // times lets us extrapolate. Avoid GROUP because we want per-counter times.
    attr.read_format =
        PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
    attr.inherit = 1;
    attr.exclude_kernel = 1;  // required if perf_event_paranoid == 1
    attr.exclude_hv = 1;      // = hypervisor
    return attr;
  }

  // Opens one counter. `group_fd` is -1 to create a new group leader, or the
  // leader's fd to join its group. Returns the new fd, or a negative value on
  // failure (errno is set by the syscall).
  static int SysPerfEventOpen(const CounterConfig& cc, int group_fd) {
    perf_event_attr attr = MakeAttr(cc);
    // Only disable the group leader; other counters are gated on it.
    if (group_fd == -1) {
      attr.disabled = 1;
    }

    const int pid = 0;   // current process
    const int cpu = -1;  // any CPU
    // syscall() returns long; the result is an fd or -1, so it fits in int.
    return static_cast<int>(syscall(__NR_perf_event_open, &attr, pid, cpu,
                                    group_fd, /*flags=*/0));
  }

  // Layout of what read() returns for the read_format set in MakeAttr.
  struct Buf {
    uint64_t value;
    uint64_t time_enabled;
    uint64_t time_running;
  };

  // Reads one counter into `buf`, retrying while read() fails with EAGAIN or
  // EINTR. Returns true only if a complete Buf was read. On false, errno is
  // the error from read(), or 0 if read() returned a short count.
  static bool ReadCounter(int fd, Buf& buf) {
    for (;;) {
      // Clear errno so that a short (non-negative) read is not misreported or
      // retried forever based on a stale value from an earlier call.
      errno = 0;
      const ssize_t bytes_read = read(fd, &buf, sizeof(buf));
      if (bytes_read >= static_cast<ssize_t>(sizeof(buf))) return true;
      if (bytes_read < 0 && (errno == EAGAIN || errno == EINTR)) continue;
      return false;
    }
  }

 public:
  // Opens all counters. Counters that cannot be opened are reported on stderr
  // and skipped; if perf counters are unsupported, no fds are opened and
  // Start/Stop become no-ops.
  Impl() {
    if (!PerfCountersSupported()) {
      fprintf(stderr,
              "This Linux does not support perf counters. The program will "
              "continue, but counters will return zero.\n");
      return;
    }

    // Use groups so that all counters are enabled at the same time.
    int group_fd = -1;

    fds_.reserve(PerfCounters::Num());
    size_t idx_counter = 0;  // for valid_

    PerfCounters counters;  // unused
    counters.ForEach(       // requires C++14 lambda template
        &counters, [&](auto& /*val*/, auto& /*val2*/, const char* name) {
          const CounterConfig config = FindCounterConfig(name);
          const int fd = SysPerfEventOpen(config, group_fd);
          if (fd < 0) {
            fprintf(stderr, "perf_event_open error %d for counter %s\n", errno,
                    name);
          } else {
            // Set event count to zero to make overflow less likely.
            ioctl(fd, PERF_EVENT_IOC_RESET, 0);

            // The first successfully opened counter becomes the group leader.
            if (group_fd == -1) group_fd = fd;

            valid_.Set(idx_counter);
            fds_.push_back(fd);
          }

          if (idx_counter == 0) {
            // Ensure the first counter is a HW event, because later adding a HW
            // event to a group with only SW events is slow.
            HWY_ASSERT(config.type == PERF_TYPE_HARDWARE);
          }
          ++idx_counter;
        });

    HWY_ASSERT(fds_.size() == valid_.Count());
  }

  // Owns file descriptors, so copying would close each of them twice.
  Impl(const Impl&) = delete;
  Impl& operator=(const Impl&) = delete;

  // Closes every fd opened by the constructor.
  ~Impl() {
    for (int fd : fds_) {
      HWY_ASSERT(fd >= 0);
      HWY_ASSERT(close(fd) == 0);
    }
  }

  // Enables counting. Returns false if no counter could be opened.
  bool Start() {
    if (fds_.empty()) return false;  // ctor failed
    // Enabling the first fd (group leader) enables all.
    HWY_ASSERT(ioctl(fds_[0], PERF_EVENT_IOC_ENABLE, 0) == 0);
    return true;
  }

  // Disables counting and stores each counter's value into `counters`, scaled
  // up by time_enabled / time_running to compensate for multiplexing.
  // Counters that were not opened, could not be read, or never ran are set to
  // zero. Returns the smallest running/enabled fraction among the counters
  // that ran (1.0 if none was multiplexed), or 0.0 if no counter was opened.
  double Stop(PerfCounters& counters) {
    if (fds_.empty()) return 0.0;  // ctor failed

    // First stop all so that we measure over the same time interval.
    // The return value is intentionally not checked here.
    ioctl(fds_[0], PERF_EVENT_IOC_DISABLE, 0);

    double min_fraction = 1.0;
    // Visits in the same order they were initialized.
    size_t idx_counter = 0;
    size_t idx_fd = 0;  // fds_ only holds entries for valid_ counters
    counters.ForEach(
        &counters, [&](auto& val, auto& /*val2*/, const char* name) {
          using T = hwy::RemoveRef<decltype(val)>;
          val = T{0};
          if (valid_.Get(idx_counter)) {
            const int fd = fds_[idx_fd++];
            Buf buf;
            if (!ReadCounter(fd, buf)) {
              fprintf(stderr, "perf_counters read() error %d for %s\n", errno,
                      name);
            } else {
              HWY_ASSERT(buf.time_running <= buf.time_enabled);
              // time_running == 0 means the counter was never scheduled;
              // leave its value at zero rather than dividing by zero.
              if (buf.time_running != 0) {
                const double fraction =
                    static_cast<double>(buf.time_running) / buf.time_enabled;
                HWY_ASSERT(0.0 < fraction && fraction <= 1.0);
                min_fraction = HWY_MIN(min_fraction, fraction);
                val = static_cast<T>(static_cast<double>(buf.value) / fraction);
              }
            }
          }
          ++idx_counter;
        });
    return min_fraction;
  }

 private:
  BitSet64 valid_;        // which counters are available
  std::vector<int> fds_;  // size == valid_.Count()
};

// Linux with C++14 or later: PMU forwards every call to a heap-allocated Impl.
PMU::PMU() : impl_(new Impl) {}
// Defined here, after the Impl class definition above.
PMU::~PMU() = default;
// Returns false if Impl could not open any counter.
bool PMU::Start() { return impl_->Start(); }
// Fills `counters` and returns the value computed by Impl::Stop.
double PMU::Stop(PerfCounters& counters) { return impl_->Stop(counters); }
#else
// Fallback when the Linux/C++14 condition above is not met: no counters exist.
PMU::PMU() {}
PMU::~PMU() = default;
// Always reports failure to start.
bool PMU::Start() { return false; }
// Leaves `counters` untouched and returns 0.0.
double PMU::Stop(PerfCounters& /*counters*/) { return 0.0; }
#endif // HWY_OS_LINUX ..

} // namespace platform
} // namespace hwy
Loading
Loading