Merge branch 'master' into backup3

kimwalisch · Mar 5, 2021 · 259f841 · 259f841
2 parents 1ebd2a4 + 4e02809
commit 259f841
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 30 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,4 +1,4 @@
-Changes in primecount-6.3, 2021-03-01
+Changes in primecount-6.3, 2021-03-02
 
   The memory usage of the PiTable and SegmentedPiTable has been
   reduced by about 2x, this speeds up the computation of the easy
@@ -11,6 +11,7 @@ Changes in primecount-6.3, 2021-03-01
 
   * PiTable.cpp: Reduce memory usage by 2x.
   * SegmentedPiTable.cpp: Reduce memory usage by 2x.
+  * Sieve.cpp: Reduce memory usage of counters array by 2x.
   * phi.cpp: Fixed integer overflow #39.
   * phi.cpp: Cache 40x more phi(x, a) results.
   * phi.cpp: Use pi(x) if a > pi(sqrt(x)).

diff --git a/include/Sieve.hpp b/include/Sieve.hpp
@@ -104,7 +104,7 @@ class Sieve
   uint64_t counters_dist_log2_ = 0;
   uint64_t counters_stop_ = 0;
   pod_vector<uint8_t> sieve_;
-  pod_vector<uint64_t> counters_;
+  pod_vector<uint32_t> counters_;
   std::vector<Wheel> wheel_;
 };
 

diff --git a/src/Sieve.cpp b/src/Sieve.cpp
@@ -61,7 +61,7 @@ struct WheelInit
 /// Categorize sieving primes according to their modulo 30
 /// congruence class { 1, 7, 11, 13, 17, 19, 23, 29 }.
 ///
-const array<int, 30> wheel_offsets =
+const array<uint8_t, 30> wheel_offsets =
 {
   0, 8 * 0, 0, 0, 0, 0,
   0, 8 * 1, 0, 0, 0, 8 * 2,
@@ -73,15 +73,15 @@ const array<int, 30> wheel_offsets =
 /// Used to calculate the first multiple > start of a
 /// sieving prime that is coprime to 2, 3, 5.
 ///
-const WheelInit wheel_init[30] =
-{
+const array<WheelInit, 30> wheel_init
+{{
   {1,  0}, {0,  0}, {5,  1}, {4,  1}, {3,  1},
   {2,  1}, {1,  1}, {0,  1}, {3,  2}, {2,  2},
   {1,  2}, {0,  2}, {1,  3}, {0,  3}, {3,  4},
   {2,  4}, {1,  4}, {0,  4}, {1,  5}, {0,  5},
   {3,  6}, {2,  6}, {1,  6}, {0,  6}, {5,  7},
   {4,  7}, {3,  7}, {2,  7}, {1,  7}, {0,  7}
-};
+}};
 
 } // namespace
 
@@ -120,7 +120,7 @@ Sieve::Sieve(uint64_t low,
 ///
 void Sieve::allocate_counters(uint64_t low)
 {
-  double average_leaf_dist = sqrt((double) low);
+  double average_leaf_dist = sqrt(low);
   double counters_dist = sqrt(average_leaf_dist);
 
   // Here we balance counting with the counters array and
@@ -137,11 +137,17 @@ void Sieve::allocate_counters(uint64_t low)
   uint64_t byte_dist = counters_dist_ / 30;
   byte_dist = max(byte_dist, 64);
   byte_dist = next_power_of_2(byte_dist);
-  counters_dist_ = byte_dist * 30;
-  counters_dist_log2_ = ilog2(byte_dist);
 
+  // Make sure the counters (32-bit) don't overflow.
+  // This can never happen since each counters array element
+  // only counts the number of unsieved elements (1 bits) in
+  // an interval of size: sieve_limit^(1/4) * sqrt(240).
+  // Hence the max(counter value) = 2^18.
+  assert(byte_dist * 8 <= numeric_limits<uint32_t>::max());
   uint64_t counters_size = ceil_div(sieve_.size(), byte_dist);
   counters_.resize(counters_size);
+  counters_dist_ = byte_dist * 30;
+  counters_dist_log2_ = ilog2(byte_dist);
 }
 
 /// The segment size is sieve.size() * 30 as each
@@ -204,9 +210,9 @@ void Sieve::init_counters(uint64_t low, uint64_t high)
     stop = min(stop, max_stop);
     uint64_t cnt = count(start, stop);
     uint64_t byte_index = start / 30;
+    uint64_t i = byte_index >> counters_dist_log2_;
 
-    counters_[byte_index >> counters_dist_log2_] = cnt;
-
+    counters_[i] = (uint32_t) cnt;
     total_count_ += cnt;
     start += counters_dist_;
   }
@@ -541,7 +547,7 @@ void Sieve::cross_off_count(uint64_t prime, uint64_t i)
   uint64_t total_count = total_count_;
   uint64_t counters_dist_log2 = counters_dist_log2_;
   uint64_t sieve_size = sieve_.size();
-  uint64_t* counters = counters_.data();
+  uint32_t* counters = counters_.data();
   uint8_t* sieve = sieve_.data();
 
   #define CHECK_FINISHED(wheel_index) \

diff --git a/src/deleglise-rivat/S2_hard.cpp b/src/deleglise-rivat/S2_hard.cpp
@@ -18,7 +18,7 @@
 ///        method, Revista do DETUA, vol. 4, no. 6, March 2006,
 ///        pp. 759-768.
 ///
-/// Copyright (C) 2020 Kim Walisch, <[email protected]>
+/// Copyright (C) 2021 Kim Walisch, <[email protected]>
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -71,12 +71,13 @@ T S2_hard_thread(T x,
   T sum = 0;
 
   int64_t low = thread.low;
+  int64_t low1 = max(low, 1);
   int64_t segments = thread.segments;
   int64_t segment_size = thread.segment_size;
-  int64_t pi_sqrty = pi[isqrt(y)];
-  int64_t low1 = max(low, 1);
   int64_t limit = min(low + segments * segment_size, z);
-  int64_t max_b = pi[min3(isqrt(x / low1), isqrt(z), y)];
+  int64_t pi_sqrty = pi[isqrt(y)];
+  int64_t max_b = (limit <= y) ? pi_sqrty
+      : pi[min3(isqrt(x / low1), isqrt(z), y)];
   int64_t min_b = pi[min(z / limit, primes[max_b])];
   min_b = max(c, min_b) + 1;
 
@@ -94,6 +95,8 @@ T S2_hard_thread(T x,
     int64_t high = min(low + segment_size, limit);
     low1 = max(low, 1);
 
+    // For b < min_b there are no special leaves:
+    // low <= x / (primes[b] * m) < high
     sieve.pre_sieve(primes, min_b - 1, low, high);
     int64_t b = min_b;
 

diff --git a/src/gourdon/D.cpp b/src/gourdon/D.cpp
@@ -15,7 +15,7 @@
 ///        compressed lookup table of moebius function values,
 ///        least prime factors and max prime factors.
 ///
-/// Copyright (C) 2020 Kim Walisch, <[email protected]>
+/// Copyright (C) 2021 Kim Walisch, <[email protected]>
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -61,10 +61,10 @@ T D_thread(T x,
   T sum = 0;
 
   int64_t low = thread.low;
+  int64_t low1 = max(low, 1);
   int64_t segments = thread.segments;
   int64_t segment_size = thread.segment_size;
   int64_t pi_sqrtz = pi[isqrt(z)];
-  int64_t low1 = max(low, 1);
   int64_t limit = min(low + segments * segment_size, xz);
   int64_t max_b = pi[min3(isqrt(x / low1), isqrt(limit), x_star)];
   int64_t min_b = pi[min(xz / limit, x_star)];
@@ -84,8 +84,8 @@ T D_thread(T x,
     int64_t high = min(low + segment_size, limit);
     low1 = max(low, 1);
 
-    // For i < min_b there are no special leaves:
-    // low <= x / (primes[i] * m) < high
+    // For b < min_b there are no special leaves:
+    // low <= x / (primes[b] * m) < high
     sieve.pre_sieve(primes, min_b - 1, low, high);
     int64_t b = min_b;
 

diff --git a/src/lmo/pi_lmo_parallel.cpp b/src/lmo/pi_lmo_parallel.cpp
@@ -18,7 +18,7 @@
 ///        method, Revista do DETUA, vol. 4, no. 6, March 2006,
 ///        pp. 759-768.
 ///
-/// Copyright (C) 2020 Kim Walisch, <[email protected]>
+/// Copyright (C) 2021 Kim Walisch, <[email protected]>
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -59,14 +59,16 @@ int64_t S2_thread(int64_t x,
 {
   int64_t sum = 0;
   int64_t low = thread.low;
+  int64_t low1 = max(low, 1);
   int64_t segments = thread.segments;
   int64_t segment_size = thread.segment_size;
-  int64_t low1 = max(low, 1);
+  int64_t pi_sqrty = pi[isqrt(y)];
   int64_t limit = min(low + segments * segment_size, z + 1);
   int64_t max_b = pi[min(isqrt(x / low1), y - 1)];
-  int64_t pi_sqrty = pi[isqrt(y)];
+  int64_t min_b = pi[min(z / limit, primes[max_b])];
+  min_b = max(c, min_b) + 1;
 
-  if (c >= max_b)
+  if (min_b > max_b)
     return 0;
 
   Sieve sieve(low, segment_size, max_b);
@@ -80,8 +82,10 @@ int64_t S2_thread(int64_t x,
     int64_t high = min(low + segment_size, limit);
     low1 = max(low, 1);
 
-    sieve.pre_sieve(primes, c, low, high);
-    int64_t b = c + 1;
+    // For b < min_b there are no special leaves:
+    // low <= x / (primes[b] * m) < high
+    sieve.pre_sieve(primes, min_b - 1, low, high);
+    int64_t b = min_b;
 
     // For c + 1 <= b <= pi_sqrty
     // Find all special leaves in the current segment that are

diff --git a/src/mpi/D_mpi.cpp b/src/mpi/D_mpi.cpp
@@ -4,7 +4,7 @@
 ///        algorithm) that has been distributed using MPI (Message
 ///        Passing Interface) and multi-threaded using OpenMP.
 ///
-/// Copyright (C) 2020 Kim Walisch, <[email protected]>
+/// Copyright (C) 2021 Kim Walisch, <[email protected]>
 ///
 /// This file is distributed under the BSD License. See the COPYING
 /// file in the top level directory.
@@ -52,10 +52,10 @@ T D_thread(T x,
   T sum = 0;
 
   int64_t low = thread.low;
+  int64_t low1 = max(low, 1);
   int64_t segments = thread.segments;
   int64_t segment_size = thread.segment_size;
   int64_t pi_sqrtz = pi[isqrt(z)];
-  int64_t low1 = max(low, 1);
   int64_t limit = min(low + segments * segment_size, xz);
   int64_t max_b = pi[min3(isqrt(x / low1), isqrt(limit), x_star)];
   int64_t min_b = pi[min(xz / limit, x_star)];
@@ -75,8 +75,8 @@ T D_thread(T x,
     int64_t high = min(low + segment_size, limit);
     low1 = max(low, 1);
 
-    // For i < min_b there are no special leaves:
-    // low <= x / (primes[i] * m) < high
+    // For b < min_b there are no special leaves:
+    // low <= x / (primes[b] * m) < high
     sieve.pre_sieve(primes, min_b - 1, low, high);
     int64_t b = min_b;