Skip to content

Commit

Permalink
Reduce code bloat
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Jun 20, 2024
1 parent 4a44eda commit 58243da
Showing 1 changed file with 45 additions and 15 deletions.
60 changes: 45 additions & 15 deletions include/popcnt.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@

#include <stdint.h>

#if defined(ENABLE_CPUID_POPCNT)

namespace {

/// This uses fewer arithmetic operations than any other known
/// implementation on machines with fast multiplication.
/// It uses 12 arithmetic operations, one of which is a multiply.
/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
///
inline uint64_t popcnt64_bitwise(uint64_t x)
NOINLINE uint64_t popcnt64_bitwise_noinline(uint64_t x)
{
uint64_t m1 = 0x5555555555555555ll;
uint64_t m2 = 0x3333333333333333ll;
Expand All @@ -40,6 +42,8 @@ inline uint64_t popcnt64_bitwise(uint64_t x)

} // namespace

#endif

// GCC & Clang
#if defined(__GNUC__) || \
__has_builtin(__builtin_popcountl)
Expand All @@ -62,11 +66,11 @@ inline uint64_t popcnt64(uint64_t x)
}
else
{
// On x86 and x64 CPUs, when using the GCC compiler,
// __builtin_popcount*(x) is slow (a non-inlined function call)
// when compiling without -mpopcnt. Therefore we avoid
// using __builtin_popcount*(x) here.
return popcnt64_bitwise(x);
// Preventing this function from being inlined reduces code
// bloat and shrinks primecount's binary size by about 5%.
// Since virtually all x86 CPUs support the POPCNT instruction,
// this code path is very rarely executed.
return popcnt64_bitwise_noinline(x);
}
}

Expand All @@ -88,11 +92,11 @@ inline uint64_t popcnt64(uint64_t x)
}
else
{
// On x86 and x64 CPUs, when using the GCC compiler,
// __builtin_popcount*(x) is slow (a non-inlined function call)
// when compiling without -mpopcnt. Therefore we avoid
// using __builtin_popcount*(x) here.
return popcnt64_bitwise(x);
// Preventing this function from being inlined reduces code
// bloat and shrinks primecount's binary size by about 5%.
// Since virtually all x86 CPUs support the POPCNT instruction,
// this code path is very rarely executed.
return popcnt64_bitwise_noinline(x);
}
}

Expand Down Expand Up @@ -138,9 +142,22 @@ inline uint64_t popcnt64(uint64_t x)
if_likely(cpu_supports_popcnt)
return __popcnt64(x);
else
return popcnt64_bitwise(x);
return popcnt64_bitwise_noinline(x);
#else
return popcnt64_bitwise(x);
// This uses fewer arithmetic operations than any other known
// implementation on machines with fast multiplication.
// It uses 12 arithmetic operations, one of which is a multiply.
// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
uint64_t m1 = 0x5555555555555555ll;
uint64_t m2 = 0x3333333333333333ll;
uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
uint64_t h01 = 0x0101010101010101ll;

x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
x = (x + (x >> 4)) & m4;

return (x * h01) >> 56;
#endif
}

Expand All @@ -164,9 +181,22 @@ inline uint64_t popcnt64(uint64_t x)
return __popcnt(uint32_t(x)) +
__popcnt(uint32_t(x >> 32));
else
return popcnt64_bitwise(x);
return popcnt64_bitwise_noinline(x);
#else
return popcnt64_bitwise(x);
// This uses fewer arithmetic operations than any other known
// implementation on machines with fast multiplication.
// It uses 12 arithmetic operations, one of which is a multiply.
// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
uint64_t m1 = 0x5555555555555555ll;
uint64_t m2 = 0x3333333333333333ll;
uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
uint64_t h01 = 0x0101010101010101ll;

x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
x = (x + (x >> 4)) & m4;

return (x * h01) >> 56;
#endif
}

Expand Down

0 comments on commit 58243da

Please sign in to comment.