Reduce code bloat

kimwalisch · Jun 20, 2024 · 58243da
1 parent 4a44eda
commit 58243da
Showing 1 changed file with 45 additions and 15 deletions.
diff --git a/include/popcnt.hpp b/include/popcnt.hpp
@@ -17,14 +17,16 @@
 
 #include <stdint.h>
 
+#if defined(ENABLE_CPUID_POPCNT)
+
 namespace {
 
 /// This uses fewer arithmetic operations than any other known
 /// implementation on machines with fast multiplication.
 /// It uses 12 arithmetic operations, one of which is a multiply.
 /// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
 ///
-inline uint64_t popcnt64_bitwise(uint64_t x)
+NOINLINE uint64_t popcnt64_bitwise_noinline(uint64_t x)
 {
   uint64_t m1 = 0x5555555555555555ll;
   uint64_t m2 = 0x3333333333333333ll;
@@ -40,6 +42,8 @@ inline uint64_t popcnt64_bitwise(uint64_t x)
 
 } // namespace
 
+#endif
+
 // GCC & Clang
 #if defined(__GNUC__) || \
     __has_builtin(__builtin_popcountl)
@@ -62,11 +66,11 @@ inline uint64_t popcnt64(uint64_t x)
   }
   else
   {
-    // On x86 and x64 CPUs when using the GCC compiler
-    // __builtin_popcount*(x) is slow (not inlined function call)
-    // when compiling without -mpopcnt. Therefore we avoid
-    // using __builtin_popcount*(x) here.
-    return popcnt64_bitwise(x);
+    // Keeping the bitwise fallback out of line (not inlined) reduces
+    // code bloat and shrinks primecount's binary size by about 5%.
+    // Since virtually all x86 CPUs support the POPCNT instruction,
+    // this code path is very rarely executed.
+    return popcnt64_bitwise_noinline(x);
   }
 }
 
@@ -88,11 +92,11 @@ inline uint64_t popcnt64(uint64_t x)
   }
   else
   {
-    // On x86 and x64 CPUs when using the GCC compiler
-    // __builtin_popcount*(x) is slow (not inlined function call)
-    // when compiling without -mpopcnt. Therefore we avoid
-    // using __builtin_popcount*(x) here.
-    return popcnt64_bitwise(x);
+    // Keeping the bitwise fallback out of line (not inlined) reduces
+    // code bloat and shrinks primecount's binary size by about 5%.
+    // Since virtually all x86 CPUs support the POPCNT instruction,
+    // this code path is very rarely executed.
+    return popcnt64_bitwise_noinline(x);
   }
 }
 
@@ -138,9 +142,22 @@ inline uint64_t popcnt64(uint64_t x)
   if_likely(cpu_supports_popcnt)
     return __popcnt64(x);
   else
-    return popcnt64_bitwise(x);
+    return popcnt64_bitwise_noinline(x);
 #else
-  return popcnt64_bitwise(x);
+  // This uses fewer arithmetic operations than any other known
+  // implementation on machines with fast multiplication.
+  // It uses 12 arithmetic operations, one of which is a multiply.
+  // http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+  uint64_t m1 = 0x5555555555555555ll;
+  uint64_t m2 = 0x3333333333333333ll;
+  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
+  uint64_t h01 = 0x0101010101010101ll;
+
+  x -= (x >> 1) & m1;
+  x = (x & m2) + ((x >> 2) & m2);
+  x = (x + (x >> 4)) & m4;
+
+  return (x * h01) >> 56;
 #endif
 }
 
@@ -164,9 +181,22 @@ inline uint64_t popcnt64(uint64_t x)
     return __popcnt(uint32_t(x)) +
            __popcnt(uint32_t(x >> 32));
   else
-    return popcnt64_bitwise(x);
+    return popcnt64_bitwise_noinline(x);
 #else
-  return popcnt64_bitwise(x);
+  // This uses fewer arithmetic operations than any other known
+  // implementation on machines with fast multiplication.
+  // It uses 12 arithmetic operations, one of which is a multiply.
+  // http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
+  uint64_t m1 = 0x5555555555555555ll;
+  uint64_t m2 = 0x3333333333333333ll;
+  uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
+  uint64_t h01 = 0x0101010101010101ll;
+
+  x -= (x >> 1) & m1;
+  x = (x & m2) + ((x >> 2) & m2);
+  x = (x + (x >> 4)) & m4;
+
+  return (x * h01) >> 56;
 #endif
 }