Skip to content

Commit

Permalink
topology fixes for M3
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 698391594
  • Loading branch information
jan-wassenberg authored and copybara-github committed Nov 20, 2024
1 parent 8a0602d commit 5242c3f
Showing 1 changed file with 45 additions and 31 deletions.
76 changes: 45 additions & 31 deletions hwy/contrib/thread_pool/topology.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,39 @@ HWY_CONTRIB_DLLEXPORT bool HaveThreadingSupport() {
#endif
}

// Computes the exact quotient `whole / part`. `part` is asserted to be nonzero
// and is expected to be a factor of `whole`: if multiplying the quotient back
// by `part` does not reproduce `whole`, HWY_ABORT is invoked with both operands,
// the truncated quotient and the mismatching product.
static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
  HWY_ASSERT(part != 0);
  const size_t quotient = whole / part;
  // Integer division truncates, so any remainder makes this fall short.
  const size_t product = quotient * part;
  if (product != whole) {
    HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, quotient, part,
              product);
  }
  return quotient;
}

#if HWY_OS_APPLE

// Reads the sysctl entry `name` via sysctlbyname(). On success, stores the
// value divided by `div` (via `DivByFactor`, then converted to `T`) in `*out`
// and returns true. On failure, stores the nonzero return code in `err`,
// leaves `*out` untouched and returns false.
template <typename T>
bool Sysctl(const char* name, size_t div, int& err, T* out) {
  size_t value = 0;
  size_t value_bytes = sizeof(value);
  // Query only: the trailing (new value, new length) pair stays unset because
  // we never want to modify the entry.
  const int status = sysctlbyname(name, &value, &value_bytes, nullptr, 0);
  if (HWY_UNLIKELY(status != 0)) {
    // Stay silent here: some `name` are expected to fail.
    err = status;
    return false;
  }
  const size_t scaled = DivByFactor(value, div);
  *out = static_cast<T>(scaled);
  return true;
}

#endif // HWY_OS_APPLE

HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
size_t lp = 0;
#if HWY_ARCH_WASM
Expand All @@ -99,6 +132,11 @@ HWY_CONTRIB_DLLEXPORT size_t TotalLogicalProcessors() {
} else {
lp = static_cast<size_t>(ret);
}
#elif HWY_OS_APPLE
int err;
if (!Sysctl("hw.logicalcpu", 1, err, &lp)) {
lp = 0;
}
#endif

if (HWY_UNLIKELY(lp == 0)) { // Failed to detect.
Expand Down Expand Up @@ -568,18 +606,6 @@ HWY_CONTRIB_DLLEXPORT Topology::Topology() {

using Caches = std::array<Cache, 4>;

// Computes the exact quotient `whole / part`. `part` is asserted to be nonzero
// and is expected to be a factor of `whole`: if multiplying the quotient back
// by `part` does not reproduce `whole`, HWY_ABORT is invoked with both operands,
// the truncated quotient and the mismatching product.
static HWY_MAYBE_UNUSED size_t DivByFactor(size_t whole, size_t part) {
  HWY_ASSERT(part != 0);
  const size_t quotient = whole / part;
  // Integer division truncates, so any remainder makes this fall short.
  const size_t product = quotient * part;
  if (product != whole) {
    HWY_ABORT("%zu / %zu = %zu; *%zu = %zu\n", whole, part, quotient, part,
              product);
  }
  return quotient;
}

// We assume homogeneous caches across all clusters because some OS APIs return
// a single value for a class of CPUs.

Expand Down Expand Up @@ -798,22 +824,6 @@ static bool InitCachesWin(Caches& caches) {
#endif // HWY_OS_WIN

#if HWY_OS_APPLE
// Reads the sysctl entry `name` via sysctlbyname(). On success, stores the
// value divided by `div` (via `DivByFactor`, then converted to `T`) in `*out`
// and returns true. On failure, stores the nonzero return code in `err`,
// leaves `*out` untouched and returns false.
template <typename T>
bool Sysctl(const char* name, size_t div, int& err, T* out) {
  size_t value = 0;
  size_t value_bytes = sizeof(value);
  // Query only: the trailing (new value, new length) pair stays unset because
  // we never want to modify the entry.
  const int status = sysctlbyname(name, &value, &value_bytes, nullptr, 0);
  if (HWY_UNLIKELY(status != 0)) {
    // Stay silent here: some `name` are expected to fail.
    err = status;
    return false;
  }
  const size_t scaled = DivByFactor(value, div);
  *out = static_cast<T>(scaled);
  return true;
}

static bool InitCachesApple(Caches& caches) {
int err = 0;
Expand All @@ -834,7 +844,9 @@ static bool InitCachesApple(Caches& caches) {
}
L1.cores_sharing = 1;
if (Sysctl("hw.perflevel0.cpusperl2", 1, err, &L2.cores_sharing)) {
L2.size_kib = DivByFactor(L2.size_kib, L2.cores_sharing);
// There exist CPUs for which L2 is not evenly divisible by `cores_sharing`,
// hence do not use `DivByFactor`. It is safer to round down.
L2.size_kib /= L2.cores_sharing;
} else {
L2.cores_sharing = 1;
}
Expand All @@ -844,7 +856,7 @@ static bool InitCachesApple(Caches& caches) {
char brand[128] = {0};
size_t size = sizeof(brand);
if (!sysctlbyname("machdep.cpu.brand_string", brand, &size, nullptr, 0)) {
if (!strncmp(brand, "Apple ", 6)) {
if (strncmp(brand, "Apple ", 6) != 0) {
// Unexpected, but we will continue to check the string suffixes.
HWY_WARN("unexpected Apple brand %s\n", brand);
}
Expand Down Expand Up @@ -900,8 +912,10 @@ static bool InitCachesApple(Caches& caches) {
if (L3.size_kib == 0 &&
(Sysctl("hw.perflevel0.l3cachesize", 1024, err, &L3.size_kib) ||
Sysctl("hw.l3cachesize", 1024, err, &L3.size_kib))) {
// There exist CPUs for which L3 is not evenly divisible by `cores_sharing`,
// hence do not use `DivByFactor`. It is safer to round down.
if (Sysctl("hw.perflevel0.cpusperl3", 1, err, &L3.cores_sharing)) {
L3.size_kib = DivByFactor(L3.size_kib, L3.cores_sharing);
L3.size_kib /= L3.cores_sharing;
} else {
L3.cores_sharing = 1;
}
Expand Down

0 comments on commit 5242c3f

Please sign in to comment.