From 77e6ddfc0050f501c5635ebb86d3db504ba40207 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:30:59 -0600 Subject: [PATCH 01/19] [macOS] Comparison between exactly same types Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index b29652a3ede1..d86fc5775415 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1); + HPX_TEST_EQ(i.get_tag(), 1UL); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0); + HPX_TEST_EQ(j.get_next_tag(), 0UL); } { From 0a0fec58616db5e149b082689dd65ce2e813c078 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 23 Jan 2024 11:32:20 -0600 Subject: [PATCH 02/19] [macOS] Apple does not seem to have any typedef for unsigned long int Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 3d7cf5da2aa0..8a01d9574853 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,6 +57,10 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); +#ifdef __APPLE__ + template HPX_CORE_EXPORT void print_dec( + std::ostream&, unsigned long const&, int); +#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From b68fc468d4019ac12cd5d5a60dc3ff18a8099db0 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Wed, 4 Dec 2024 00:51:55 -0600 Subject: [PATCH 03/19] Fix some issues and add tests for determinism - Add Kate's CUDA impl for RFA TODO: - Use original RFA instead of Kate's - Make a parallel version out of it - Make a partition vector suitable version Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 2 +- .../hpx/parallel/algorithms/detail/rfa.hpp | 1 - .../parallel/algorithms/detail/rfa_cuda.hpp | 1168 +++++++++++++++++ .../tests/unit/algorithms/CMakeLists.txt | 4 + .../unit/algorithms/reduce_deterministic.cpp | 1 - 5 files changed, 1173 insertions(+), 3 deletions(-) create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index b37730889172..7e29adce97cf 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index 2c72f10bfb91..b8f5da5f233a 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -245,7 +245,6 @@ namespace hpx::parallel::detail::rfa { ///The number of deposits that can be performed before a renorm is necessary. ///Applies also to binned complex double precision. static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); - ///Return a binned floating-point reference bin inline const ftype* binned_bins(const int x) const { diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp new file mode 100644 index 000000000000..05f71d9ae746 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp @@ -0,0 +1,1168 @@ +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include + +#ifndef __CUDACC__ +#define __host__ +#define __device__ +#define __forceinline__ +#include +using std::array; +using std::max; +using std::min; +#else +#include +using cuda::std::array; +using cuda::std::max; +using cuda::std::min; +#include "vector.hpp" +#endif + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char bin_host_buffer[sizeof(RFA_bins)]; +#ifdef __CUDACC__ + __constant__ static char bin_device_buffer[sizeof(RFA_bins)]; +#endif + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + array data = {0}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { +#ifdef __CUDA_ARCH__ // must be arch not CC here + return &reinterpret_cast&>(bin_device_buffer)[x]; +#else + return &reinterpret_cast&>(bin_host_buffer)[x]; +#endif + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indicies correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == 0.0) + { + return MAXINDEX; + } + else + { + frexp(x, &exp); + return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indicies correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#pragma unroll + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * COMPRESSION; + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + M *= EXPANSION * 0.5; + x += M; + x += M; +#pragma unroll + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#pragma unroll + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = + max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); + Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); + Y += (primary(0) - bins[0]) * scale_down * EXPANSION; + i = 2; + } + else + { + Y += carry(0) * ((bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); + Y += + (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; + } + if (i == FOLD) + { + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += primary((i - 1) * incpriX) - bins[i - 1]; + } + Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; + } + else + { + Y += carry(0) * (bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += carry(i * inccarX) * (bins[i] / 6.0); + Y += (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + ///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (bins[0] / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (bins[0] / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); + //shift Y upwards and add X to Y +#pragma unroll + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); + //shift X upwards and add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); + // add X to Y +#pragma unroll + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr int endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. + constexpr size_t number_of_reference_bins() + { + return array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return binned_conv_single(1, 1); + } + else + { + return binned_conv_double(1, 1); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast(max(X, ldexp(0.5, MIN_EXP - 1)) * + ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = max(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = fmax(max_abs_val, abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + + +} // namespace hpx::parallel::detail::rfa \ No newline at end of file diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 559ee830030e..76dc5fcd9806 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -246,3 +246,7 @@ foreach(test ${tests}) "modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS} ) endforeach() + +target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address) + +target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address) \ No newline at end of file diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index c8271da6ac6b..1119c8113c6b 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include "test_utils.hpp" From 5c0129f6fa761da67da30af1283c2304736885b1 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sat, 7 Dec 2024 00:44:59 -0600 Subject: [PATCH 04/19] Remove unnecessary things from rfa - Also perform renorm and update only when necessary Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 2 +- .../parallel/algorithms/detail/rfa_cuda.hpp | 1168 ----------------- 2 files changed, 1 insertion(+), 1169 deletions(-) delete mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 7e29adce97cf..b37730889172 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp deleted file mode 100644 index 05f71d9ae746..000000000000 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp +++ /dev/null @@ -1,1168 +0,0 @@ -//Reproducible Floating Point Accumulations via Binned Floating Point -//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. -//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. -// -//The code accomplishes several objectives: -// -//1. Reproducible summation, independent of summation order, assuming only a -// subset of the IEEE 754 Floating Point Standard -// -//2. Has accuracy at least as good as conventional summation, and tunable -// -//3. Handles overflow, underflow, and other exceptions reproducibly. -// -//4. Makes only one read-only pass over the summands. -// -//5. Requires only one parallel reduction. -// -//6. Uses minimal memory (6 doubles per accumulator with fold=3). -// -//7. Relatively easy to use - -#pragma once - -#include -#include -#include -#include -#include - -#ifndef __CUDACC__ -#define __host__ -#define __device__ -#define __forceinline__ -#include -using std::array; -using std::max; -using std::min; -#else -#include -using cuda::std::array; -using cuda::std::max; -using cuda::std::min; -#include "vector.hpp" -#endif - -namespace hpx::parallel::detail::rfa { - template - struct type4 - { - F x; - F y; - F z; - F w; - }; - - template - struct type2 - { - F x; - F y; - }; - using float4 = type4; - using double4 = type4; - using float2 = type2; - using double2 = type2; - - auto abs_max(float4 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - auto z = std::abs(a.z); - auto w = std::abs(a.w); - const std::vector v = {x, y, z, w}; - return *std::max_element(v.begin(), v.end()); - } - - auto abs_max(double4 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - auto z = std::abs(a.z); - auto w = std::abs(a.w); - const std::vector v = {x, y, z, w}; - return *std::max_element(v.begin(), v.end()); - } - - auto abs_max(float2 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - const std::vector v = {x, y}; - return *std::max_element(v.begin(), v.end()); - } - - auto abs_max(double2 a) - { - auto x = std::abs(a.x); - auto y = std::abs(a.y); - const std::vector v = {x, y}; - return *std::max_element(v.begin(), v.end()); - } - -// disable zero checks -#define DISABLE_ZERO - -// disable nan / infinity checks -#define DISABLE_NANINF - -// jump table for indexing into data -#define MAX_JUMP 5 - static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); - - template - inline constexpr Real ldexp_impl(Real arg, int exp) noexcept - { - return std::ldexp(arg, exp); - // while (arg == 0) - // { - // return arg; - // } - // while (exp > 0) - // { - // arg *= static_cast(2); - // --exp; - // } - // while (exp < 0) - // { - // arg /= static_cast(2); - // ++exp; - // } - - // return arg; - } - - template - struct RFA_bins - { - static constexpr auto BIN_WIDTH = - std::is_same_v ? 40 : 13; - static constexpr auto MIN_EXP = - std::numeric_limits::min_exponent; - static constexpr auto MAX_EXP = - std::numeric_limits::max_exponent; - static constexpr auto MANT_DIG = std::numeric_limits::digits; - ///Binned floating-point maximum index - static constexpr auto MAXINDEX = - ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; - //The maximum floating-point fold supported by the library - static constexpr auto MAXFOLD = MAXINDEX + 1; - - ///The binned floating-point reference bins - array bins = {}; - - constexpr ftype& operator[](int d) - { - return bins[d]; - } - - void initialize_bins() - { - if constexpr (std::is_same_v) - { - bins[0] = std::ldexp(0.75, MAX_EXP); - } - else - { - bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1); - } - - for (int index = 1; index <= MAXINDEX; index++) - { - bins[index] = ldexp(0.75, - MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); - } - for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) - { - bins[index] = bins[index - 1]; - } - } - }; - - static char bin_host_buffer[sizeof(RFA_bins)]; -#ifdef __CUDACC__ - __constant__ static char bin_device_buffer[sizeof(RFA_bins)]; -#endif - - ///Class to hold a reproducible summation of the numbers passed to it - /// - ///@param ftype Floating-point data type; either `float` or `double - ///@param FOLD The fold; use 3 as a default unless you understand it. - template ::value>* = - nullptr> - class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator - { - public: - using ftype = ftype_; - static constexpr int FOLD = FOLD_; - - private: - array data = {0}; - - ///Floating-point precision bin width - static constexpr auto BIN_WIDTH = - std::is_same_v ? 40 : 13; - static constexpr auto MIN_EXP = - std::numeric_limits::min_exponent; - static constexpr auto MAX_EXP = - std::numeric_limits::max_exponent; - static constexpr auto MANT_DIG = std::numeric_limits::digits; - ///Binned floating-point maximum index - static constexpr auto MAXINDEX = - ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; - //The maximum floating-point fold supported by the library - static constexpr auto MAXFOLD = MAXINDEX + 1; - ///Binned floating-point compression factor - ///This factor is used to scale down inputs before deposition into the bin of - ///highest index - static constexpr auto COMPRESSION = - 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); - ///Binned double precision expansion factor - ///This factor is used to scale up inputs after deposition into the bin of - ///highest index - static constexpr auto EXPANSION = - 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); - static constexpr auto EXP_BIAS = MAX_EXP - 2; - static constexpr auto EPSILON = std::numeric_limits::epsilon(); - ///Binned floating-point deposit endurance - ///The number of deposits that can be performed before a renorm is necessary. - ///Applies also to binned complex double precision. - static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); - - ///Return a binned floating-point reference bin - inline const ftype* binned_bins(const int x) const - { -#ifdef __CUDA_ARCH__ // must be arch not CC here - return &reinterpret_cast&>(bin_device_buffer)[x]; -#else - return &reinterpret_cast&>(bin_host_buffer)[x]; -#endif - } - - ///Get the bit representation of a float - static inline uint32_t& get_bits(float& x) - { - return *reinterpret_cast(&x); - } - ///Get the bit representation of a double - static inline uint64_t& get_bits(double& x) - { - return *reinterpret_cast(&x); - } - ///Get the bit representation of a const float - static inline uint32_t get_bits(const float& x) - { - return *reinterpret_cast(&x); - } - ///Get the bit representation of a const double - static inline uint64_t get_bits(const double& x) - { - return *reinterpret_cast(&x); - } - - ///Return primary vector value const ref - inline const ftype& primary(int i) const - { - if constexpr (FOLD <= MAX_JUMP) - { - switch (i) - { - case 0: - if constexpr (FOLD >= 1) - return data[0]; - case 1: - if constexpr (FOLD >= 2) - return data[1]; - case 2: - if constexpr (FOLD >= 3) - return data[2]; - case 3: - if constexpr (FOLD >= 4) - return data[3]; - case 4: - if constexpr (FOLD >= 5) - return data[4]; - default: - return data[FOLD - 1]; - } - } - else - { - return data[i]; - } - } - - ///Return carry vector value const ref - inline const ftype& carry(int i) const - { - if constexpr (FOLD <= MAX_JUMP) - { - switch (i) - { - case 0: - if constexpr (FOLD >= 1) - return data[FOLD + 0]; - case 1: - if constexpr (FOLD >= 2) - return data[FOLD + 1]; - case 2: - if constexpr (FOLD >= 3) - return data[FOLD + 2]; - case 3: - if constexpr (FOLD >= 4) - return data[FOLD + 3]; - case 4: - if constexpr (FOLD >= 5) - return data[FOLD + 4]; - default: - return data[2 * FOLD - 1]; - } - } - else - { - return data[FOLD + i]; - } - } - - ///Return primary vector value ref - inline ftype& primary(int i) - { - const auto& c = *this; - return const_cast(c.primary(i)); - } - - ///Return carry vector value ref - inline ftype& carry(int i) - { - const auto& c = *this; - return const_cast(c.carry(i)); - } - -#ifdef DISABLE_ZERO - static inline constexpr bool ISZERO(const ftype) - { - return false; - } -#else - static inline constexpr bool ISZERO(const ftype x) - { - return x == 0.0; - } -#endif - -#ifdef DISABLE_NANINF - static inline constexpr int ISNANINF(const ftype) - { - return false; - } -#else - static inline constexpr int ISNANINF(const ftype x) - { - const auto bits = get_bits(x); - return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == - ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); - } -#endif - - static inline constexpr int EXP(const ftype x) - { - const auto bits = get_bits(x); - return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); - } - - ///Get index of float-point precision - ///The index of a non-binned type is the smallest index a binned type would - ///need to have to sum it reproducibly. Higher indicies correspond to smaller - ///bins. - static inline constexpr int binned_dindex(const ftype x) - { - int exp = EXP(x); - if (exp == 0) - { - if (x == 0.0) - { - return MAXINDEX; - } - else - { - frexp(x, &exp); - return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); - } - } - return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; - } - - ///Get index of manually specified binned double precision - ///The index of a binned type is the bin that it corresponds to. Higher - ///indicies correspond to smaller bins. - inline int binned_index() const - { - return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - - EXP(primary(0))) / - BIN_WIDTH; - } - - ///Check if index of manually specified binned floating-point is 0 - ///A quick check to determine if the index is 0 - inline bool binned_index0() const - { - return EXP(primary(0)) == MAX_EXP + EXP_BIAS; - } - - ///Update manually specified binned fp with a scalar (X -> Y) - /// - ///This method updates the binned fp to an index suitable for adding numbers - ///with absolute value less than @p max_abs_val - /// - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdupdate( - const ftype max_abs_val, const int incpriY, const int inccarY) - { - if (ISNANINF(primary(0))) - return; - - int X_index = binned_dindex(max_abs_val); - if (ISZERO(primary(0))) - { - const ftype* const bins = binned_bins(X_index); - for (int i = 0; i < FOLD; i++) - { - primary(i * incpriY) = bins[i]; - carry(i * inccarY) = 0.0; - } - } - else - { - int shift = binned_index() - X_index; - if (shift > 0) - { -#pragma unroll - for (int i = FOLD - 1; i >= 1; i--) - { - if (i < shift) - break; - primary(i * incpriY) = primary((i - shift) * incpriY); - carry(i * inccarY) = carry((i - shift) * inccarY); - } - const ftype* const bins = binned_bins(X_index); -#pragma unroll - for (int j = 0; j < FOLD; j++) - { - if (j >= shift) - break; - primary(j * incpriY) = bins[j]; - carry(j * inccarY) = 0.0; - } - } - } - } - - ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) - /// - ///Performs the operation Y += X on an binned type Y where the index of Y is - ///larger than the index of @p X - /// - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - void binned_dmddeposit(const ftype X, const int incpriY) - { - ftype M; - ftype x = X; - - if (ISNANINF(x) || ISNANINF(primary(0))) - { - primary(0) += x; - return; - } - - if (binned_index0()) - { - M = primary(0); - ftype qd = x * COMPRESSION; - auto& ql = get_bits(qd); - ql |= 1; - qd += M; - primary(0) = qd; - M -= qd; - M *= EXPANSION * 0.5; - x += M; - x += M; -#pragma unroll - for (int i = 1; i < FOLD - 1; i++) - { - M = primary(i * incpriY); - qd = x; - ql |= 1; - qd += M; - primary(i * incpriY) = qd; - M -= qd; - x += M; - } - qd = x; - ql |= 1; - primary((FOLD - 1) * incpriY) += qd; - } - else - { - ftype qd = x; - auto& ql = get_bits(qd); -#pragma unroll - for (int i = 0; i < FOLD - 1; i++) - { - M = primary(i * incpriY); - qd = x; - ql |= 1; - qd += M; - primary(i * incpriY) = qd; - M -= qd; - x += M; - } - qd = x; - ql |= 1; - primary((FOLD - 1) * incpriY) += qd; - } - } - - ///Renormalize manually specified binned double precision - /// - ///Renormalization keeps the primary vector within the necessary bins by - ///shifting over to the carry vector - /// - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - inline void binned_dmrenorm(const int incpriX, const int inccarX) - { - if (ISZERO(primary(0)) || ISNANINF(primary(0))) - return; - - for (int i = 0; i < FOLD; i++) - { - auto tmp_renormd = primary(i * incpriX); - auto& tmp_renorml = get_bits(tmp_renormd); - - carry(i * inccarX) += - (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; - - tmp_renorml &= ~(1ull << (MANT_DIG - 3)); - tmp_renorml |= 1ull << (MANT_DIG - 2); - primary(i * incpriX) = tmp_renormd; - } - } - - ///Add scalar to manually specified binned fp (Y += X) - /// - ///Performs the operation Y += X on an binned type Y - /// - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) - { - binned_dmdupdate(X, incpriY, inccarY); - binned_dmddeposit(X, incpriY); - binned_dmrenorm(incpriY, inccarY); - } - - ///Convert manually specified binned fp to native double-precision (X -> Y) - /// - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - double binned_conv_double(const int incpriX, const int inccarX) const - { - int i = 0; - - if (ISNANINF(primary(0))) - return primary(0); - if (ISZERO(primary(0))) - return 0.0; - - double Y = 0.0; - double scale_down; - double scale_up; - int scaled; - const auto X_index = binned_index(); - const auto* const bins = binned_bins(X_index); - if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) - { - scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); - scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); - scaled = - max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); - if (X_index == 0) - { - Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION); - Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down); - Y += (primary(0) - bins[0]) * scale_down * EXPANSION; - i = 2; - } - else - { - Y += carry(0) * ((bins[0] / 6.0) * scale_down); - i = 1; - } - for (; i < scaled; i++) - { - Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down); - Y += - (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down; - } - if (i == FOLD) - { - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) * - scale_down; - return Y * scale_up; - } - if (std::isinf(Y * scale_up)) - { - return Y * scale_up; - } - Y *= scale_up; - for (; i < FOLD; i++) - { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += primary((i - 1) * incpriX) - bins[i - 1]; - } - Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1]; - } - else - { - Y += carry(0) * (bins[0] / 6.0); - for (i = 1; i < FOLD; i++) - { - Y += carry(i * inccarX) * (bins[i] / 6.0); - Y += (primary((i - 1) * incpriX) - bins[i - 1]); - } - Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); - } - return Y; - } - - ///Convert manually specified binned fp to native single-precision (X -> Y) - /// - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - float binned_conv_single(const int incpriX, const int inccarX) const - { - int i = 0; - double Y = 0.0; - - if (ISNANINF(primary(0))) - return primary(0); - if (ISZERO(primary(0))) - return 0.0; - - //Note that the following order of summation is in order of decreasing - //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and - //the number of carries equal to 1. - const auto X_index = binned_index(); - const auto* const bins = binned_bins(X_index); - if (X_index == 0) - { - Y += (double) carry(0) * (double) (bins[0] / 6.0) * - (double) EXPANSION; - Y += (double) carry(inccarX) * (double) (bins[1] / 6.0); - Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; - i = 2; - } - else - { - Y += (double) carry(0) * (double) (bins[0] / 6.0); - i = 1; - } - for (; i < FOLD; i++) - { - Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0); - Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); - } - Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); - - return (float) Y; - } - - ///Add two manually specified binned fp (Y += X) - ///Performs the operation Y += X - /// - ///@param other Another binned fp of the same type - ///@param incpriX stride within X's primary vector (use every incpriX'th element) - ///@param inccarX stride within X's carry vector (use every inccarX'th element) - ///@param incpriY stride within Y's primary vector (use every incpriY'th element) - ///@param inccarY stride within Y's carry vector (use every inccarY'th element) - void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, - const int incpriX, const int inccarX, const int incpriY, - const int inccarY) - { - if (ISZERO(x.primary(0))) - return; - - if (ISZERO(primary(0))) - { - for (int i = 0; i < FOLD; i++) - { - primary(i * incpriY) = x.primary(i * incpriX); - carry(i * inccarY) = x.carry(i * inccarX); - } - return; - } - - if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) - { - primary(0) += x.primary(0); - return; - } - - const auto X_index = x.binned_index(); - const auto Y_index = this->binned_index(); - const auto shift = Y_index - X_index; - if (shift > 0) - { - const auto* const bins = binned_bins(Y_index); - //shift Y upwards and add X to Y -#pragma unroll - for (int i = FOLD - 1; i >= 1; i--) - { - if (i < shift) - break; - primary(i * incpriY) = x.primary(i * incpriX) + - (primary((i - shift) * incpriY) - bins[i - shift]); - carry(i * inccarY) = - x.carry(i * inccarX) + carry((i - shift) * inccarY); - } -#pragma unroll - for (int i = 0; i < FOLD; i++) - { - if (i == shift) - break; - primary(i * incpriY) = x.primary(i * incpriX); - carry(i * inccarY) = x.carry(i * inccarX); - } - } - else if (shift < 0) - { - const auto* const bins = binned_bins(X_index); - //shift X upwards and add X to Y -#pragma unroll - for (int i = 0; i < FOLD; i++) - { - if (i < -shift) - continue; - primary(i * incpriY) += - x.primary((i + shift) * incpriX) - bins[i + shift]; - carry(i * inccarY) += x.carry((i + shift) * inccarX); - } - } - else if (shift == 0) - { - const auto* const bins = binned_bins(X_index); - // add X to Y -#pragma unroll - for (int i = 0; i < FOLD; i++) - { - primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; - carry(i * inccarY) += x.carry(i * inccarX); - } - } - - binned_dmrenorm(incpriY, inccarY); - } - - ///Add two manually specified binned fp (Y += X) - ///Performs the operation Y += X - void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) - { - binned_dmdmadd(other, 1, 1, 1, 1); - } - - public: - ReproducibleFloatingAccumulator() = default; - ReproducibleFloatingAccumulator( - const ReproducibleFloatingAccumulator&) = default; - ///Sets this binned fp equal to another binned fp - ReproducibleFloatingAccumulator& operator=( - const ReproducibleFloatingAccumulator&) = default; - - ///Set the binned fp to zero - void zero() - { - data = {0}; - } - - ///Return the fold of the binned fp - constexpr int fold() const - { - return FOLD; - } - - ///Return the endurance of the binned fp - constexpr int endurance() const - { - return ENDURANCE; - } - - ///Returns the number of reference bins. Used for judging memory usage. - constexpr size_t number_of_reference_bins() - { - return array::size(); - } - - ///Accumulate an arithmetic @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - template >* = nullptr> - ReproducibleFloatingAccumulator& operator+=(const U x) - { - binned_dmdadd(static_cast(x), 1, 1); - return *this; - } - - ///Accumulate-subtract an arithmetic @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - template >* = nullptr> - ReproducibleFloatingAccumulator& operator-=(const U x) - { - binned_dmdadd(-static_cast(x), 1, 1); - return *this; - } - - ///Accumulate a binned fp @p x into the binned fp. - ReproducibleFloatingAccumulator& operator+=( - const ReproducibleFloatingAccumulator& other) - { - binned_dbdbadd(other); - return *this; - } - - ///Accumulate-subtract a binned fp @p x into the binned fp. - ///NOTE: Makes a copy and performs arithmetic; slow. - ReproducibleFloatingAccumulator& operator-=( - const ReproducibleFloatingAccumulator& other) - { - const auto temp = -other; - binned_dbdbadd(temp); - } - - ///Determines if two binned fp are equal - bool operator==(const ReproducibleFloatingAccumulator& other) const - { - return data == other.data; - } - - ///Determines if two binned fp are not equal - bool operator!=(const ReproducibleFloatingAccumulator& other) const - { - return !operator==(other); - } - - ///Sets this binned fp equal to the arithmetic value @p x - ///NOTE: Casts @p x to the type of the binned fp - template >* = nullptr> - ReproducibleFloatingAccumulator& operator=(const U x) - { - zero(); - binned_dmdadd(static_cast(x), 1, 1); - return *this; - } - - ///Returns the negative of this binned fp - ///NOTE: Makes a copy and performs arithmetic; slow. - ReproducibleFloatingAccumulator operator-() - { - constexpr int incpriX = 1; - constexpr int inccarX = 1; - ReproducibleFloatingAccumulator temp = *this; - if (primary(0) != 0.0) - { - const auto* const bins = binned_bins(binned_index()); - for (int i = 0; i < FOLD; i++) - { - temp.primary(i * incpriX) = - bins[i] - (primary(i * incpriX) - bins[i]); - temp.carry(i * inccarX) = -carry(i * inccarX); - } - } - return temp; - } - - ///Convert this binned fp into its native floating-point representation - ftype conv() const - { - if (std::is_same_v) - { - return binned_conv_single(1, 1); - } - else - { - return binned_conv_double(1, 1); - } - } - - ///@brief Get binned fp summation error bound - /// - ///This is a bound on the absolute error of a summation using binned types - /// - ///@param N The number of single precision floating point summands - ///@param max_abs_val The summand of maximum absolute value - ///@param binned_sum The value of the sum computed using binned types - ///@return The absolute error bound - static constexpr ftype error_bound( - const uint64_t N, const ftype max_abs_val, const ftype binned_sum) - { - const double X = std::abs(max_abs_val); - const double S = std::abs(binned_sum); - return static_cast(max(X, ldexp(0.5, MIN_EXP - 1)) * - ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + - ((7.0 * EPSILON) / - (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - - 7.0 * EPSILON)) * - S); - } - - ///Add @p x to the binned fp - void add(const ftype x) - { - binned_dmdadd(x, 1, 1); - } - - ///Add arithmetics in the range [first, last) to the binned fp - /// - ///@param first Start of range - ///@param last End of range - ///@param max_abs_val Maximum absolute value of any member of the range - template - void add(InputIt first, InputIt last, const ftype max_abs_val) - { - binned_dmdupdate(std::abs(max_abs_val), 1, 1); - size_t count = 0; - size_t N = last - first; - for (; first != last; first++, count++) - { - binned_dmddeposit(static_cast(*first), 1); - // first conditional allows compiler to remove the call here when possible - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - ///Add arithmetics in the range [first, last) to the binned fp - /// - ///NOTE: A maximum absolute value is calculated, so two passes are made over - /// the data - /// - ///@param first Start of range - ///@param last End of range - template - void add(InputIt first, InputIt last) - { - const auto max_abs_val = *std::max_element( - first, last, [](const auto& a, const auto& b) { - return std::abs(a) < std::abs(b); - }); - add(first, last, static_cast(max_abs_val)); - } - - ///Add @p N elements starting at @p input to the binned fp: [input, input+N) - /// - ///@param input Start of the range - ///@param N Number of elements to add - ///@param max_abs_val Maximum absolute value of any member of the range - template >* = nullptr> - void add(const T* input, const size_t N, const ftype max_abs_val) - { - if (N == 0) - return; - add(input, input + N, max_abs_val); - } - - ///Add @p N elements starting at @p input to the binned fp: [input, input+N) - /// - ///NOTE: A maximum absolute value is calculated, so two passes are made over - /// the data - /// - ///@param input Start of the range - ///@param N Number of elements to add - template >* = nullptr> - void add(const T* input, const size_t N) - { - if (N == 0) - return; - - T max_abs_val = input[0]; - for (size_t i = 0; i < N; i++) - { - max_abs_val = max(max_abs_val, std::abs(input[i])); - } - add(input, N, max_abs_val); - } - - ///Accumulate a float4 @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - ReproducibleFloatingAccumulator& operator+=(const float4& x) - { - binned_dmdupdate(abs_max(x), 1, 1); - binned_dmddeposit(static_cast(x.x), 1); - binned_dmddeposit(static_cast(x.y), 1); - binned_dmddeposit(static_cast(x.z), 1); - binned_dmddeposit(static_cast(x.w), 1); - return *this; - } - - ///Accumulate a double2 @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - ReproducibleFloatingAccumulator& operator+=(const float2& x) - { - binned_dmdupdate(abs_max(x), 1, 1); - binned_dmddeposit(static_cast(x.x), 1); - binned_dmddeposit(static_cast(x.y), 1); - return *this; - } - - ///Accumulate a double2 @p x into the binned fp. - ///NOTE: Casts @p x to the type of the binned fp - ReproducibleFloatingAccumulator& operator+=(const double2& x) - { - binned_dmdupdate(abs_max(x), 1, 1); - binned_dmddeposit(static_cast(x.x), 1); - binned_dmddeposit(static_cast(x.y), 1); - return *this; - } - - void add(const float4* input, const size_t N, float max_abs_val) - { - if (N == 0) - return; - binned_dmdupdate(max_abs_val, 1, 1); - - size_t count = 0; - for (size_t i = 0; i < N; i++) - { - binned_dmddeposit(static_cast(input[i].x), 1); - binned_dmddeposit(static_cast(input[i].y), 1); - binned_dmddeposit(static_cast(input[i].z), 1); - binned_dmddeposit(static_cast(input[i].w), 1); - - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - void add(const double2* input, const size_t N, double max_abs_val) - { - if (N == 0) - return; - binned_dmdupdate(max_abs_val, 1, 1); - - size_t count = 0; - for (size_t i = 0; i < N; i++) - { - binned_dmddeposit(static_cast(input[i].x), 1); - binned_dmddeposit(static_cast(input[i].y), 1); - - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - void add(const float2* input, const size_t N, double max_abs_val) - { - if (N == 0) - return; - binned_dmdupdate(max_abs_val, 1, 1); - - size_t count = 0; - for (size_t i = 0; i < N; i++) - { - binned_dmddeposit(static_cast(input[i].x), 1); - binned_dmddeposit(static_cast(input[i].y), 1); - - if (N > ENDURANCE && count == ENDURANCE) - { - binned_dmrenorm(1, 1); - count = 0; - } - } - } - - void add(const float4* input, const size_t N) - { - if (N == 0) - return; - - auto max_abs_val = abs_max(input[0]); - for (size_t i = 1; i < N; i++) - max_abs_val = fmax(max_abs_val, abs_max(input[i])); - - add(input, N, max_abs_val); - } - - void add(const double2* input, const size_t N) - { - if (N == 0) - return; - - auto max_abs_val = abs_max(input[0]); - for (size_t i = 1; i < N; i++) - max_abs_val = fmax(max_abs_val, abs_max(input[i])); - - add(input, N, max_abs_val); - } - - void add(const float2* input, const size_t N) - { - if (N == 0) - return; - - auto max_abs_val = abs_max(input[0]); - for (size_t i = 1; i < N; i++) - max_abs_val = fmax(max_abs_val, abs_max(input[i])); - - add(input, N, max_abs_val); - } - - ////////////////////////////////////// - //MANUAL OPERATIONS; USE WISELY - ////////////////////////////////////// - - ///Rebins for repeated accumulation of scalars with magnitude <= @p mav - /// - ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator - ///with `unsafe_add` after which `renorm()` must be called. See the source of - ///`add()` for an example - template >* = nullptr> - void set_max_abs_val(const T mav) - { - binned_dmdupdate(std::abs(mav), 1, 1); - } - - ///Add @p x to the binned fp - /// - ///This is intended to be used after a call to `set_max_abs_val()` - void unsafe_add(const ftype x) - { - binned_dmddeposit(x, 1); - } - - ///Renormalizes the binned fp - /// - ///This is intended to be used after a call to `set_max_abs_val()` and one or - ///more calls to `unsafe_add()` - void renorm() - { - binned_dmrenorm(1, 1); - } - }; - - -} // namespace hpx::parallel::detail::rfa \ No newline at end of file From 08ed522913ca9d5bbf2f94f1d5e97194484f46e0 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Mon, 9 Dec 2024 18:52:14 -0600 Subject: [PATCH 05/19] Added parallel execution of rfa reduction summation Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 93 +++++++++++++++++++ .../algorithms/reduce_deterministic.hpp | 1 + .../unit/algorithms/reduce_deterministic.cpp | 44 +++++++++ 3 files changed, 138 insertions(+) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index b37730889172..35f128a5aa4e 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,84 @@ namespace hpx::parallel::detail { } }; + template + struct sequential_reduce_deterministic_rfa_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_rfa_t> + { + private: + template + friend constexpr hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, + ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + + for (auto e = first; e != last; ++e) + { + rfa += *e; + } + return rfa; + } + + template + friend constexpr hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, + ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r) + { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + auto e = first; + for (std::size_t i = 0; i < size; ++i, ++e) + { + rfa += *e; + } + return rfa; + } + + // template , + // // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< + // // double>>::value> + // > + // friend constexpr T tag_fallback_invoke( + // sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, + // InIterE last, T init, Reduce&& r) + // { + // static_assert(hpx::util::contains, + // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< + // double>>::value); + // hpx::parallel::detail::rfa::RFA_bins bins; + // bins.initialize_bins(); + // std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + + // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + // rfa.set_max_abs_val(init); + // rfa.unsafe_add(init); + // rfa.renorm(); + // for (auto e = first; e != last; ++e) + // { + // rfa += *e; + // } + // return rfa.conv(); + // } + }; + #if !defined(HPX_COMPUTE_DEVICE_CODE) template inline constexpr sequential_reduce_deterministic_t @@ -80,4 +159,18 @@ namespace hpx::parallel::detail { } #endif +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_rfa_t + sequential_reduce_deterministic_rfa = + sequential_reduce_deterministic_rfa_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa( + Args&&... args) + { + return sequential_reduce_deterministic_rfa_t{}( + std::forward(args)...); + } +#endif } // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 5e9da5edd3ae..2bebafc41300 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -10,6 +10,7 @@ #pragma once +#include "detail/reduce_deterministic.hpp" #if defined(DOXYGEN) namespace hpx { diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 1119c8113c6b..ac495f0ec56f 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -80,6 +80,49 @@ void test_reduce1(IteratorTag) HPX_TEST_EQ(static_cast(r2), r3); } +template +void test_reduce_parallel1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par, + iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par, + // iterator_ndet(std::begin(nondeterministic)), + // iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(r1, r3); + // HPX_TEST_EQ(r2, r3); +} + template void test_reduce_determinism(IteratorTag) @@ -185,6 +228,7 @@ void test_reduce1() test_reduce1(IteratorTag()); test_reduce1(IteratorTag()); test_reduce1(IteratorTag()); + test_reduce_parallel1(IteratorTag()); } template From 95cd600e90c57d09cf93162faa7d3be67cc49d5b Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 17 Dec 2024 14:22:55 -0500 Subject: [PATCH 06/19] Address inspect tool, check module cmakelists, warnings and spell check - missing includes - prevent max/min being expanded as macros - minor spell check correction - remove pragma once in cpp file - resolve implicit type conversions in rfa type to single and double and other places - add dual license - remove unnecessary command for macos ci - use HPX_UNROLL instead of vanilla pragma - clang-17 cannot unroll so use checks - add typename qualifier for iterator type Signed-off-by: Shreyas Atre --- .../algorithms/tests/unit/algorithms/reduce_deterministic.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index ac495f0ec56f..9ad604be2611 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "test_utils.hpp" @@ -89,7 +90,8 @@ void test_reduce_parallel1(IteratorTag) using base_iterator_det = std::vector::iterator; using iterator_det = test::test_iterator; - using base_iterator_ndet = std::vector::iterator; + using base_iterator_ndet = + typename std::vector::iterator; using iterator_ndet = test::test_iterator; std::vector deterministic(LEN); From 441e53566663079b161ed1655ab138b2f5dc5070 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Wed, 1 Jan 2025 21:36:17 +0530 Subject: [PATCH 07/19] Fix parallel deterministic reduce and add benchmarks Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 87 ++++------ .../algorithms/reduce_deterministic.hpp | 41 +++++ .../benchmark_reduce_deterministic.cpp | 159 ++++++++++++++++++ 3 files changed, 237 insertions(+), 50 deletions(-) create mode 100644 libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 35f128a5aa4e..ef0ac5c93a44 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -72,76 +72,63 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_rfa_t> { private: - template + template friend constexpr hpx::parallel::detail::rfa:: ReproducibleFloatingAccumulator tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, - ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r) + ExPolicy&&, InIterB first, std::size_t partition_size, T init, + std::true_type&&) { hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); - std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - - for (auto e = first; e != last; ++e) + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + std::size_t partition_size_lim = 0; + for (auto e = first; partition_size_lim <= partition_size; + partition_size_lim++, e++) { - rfa += *e; + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } } return rfa; } - template - friend constexpr hpx::parallel::detail::rfa:: - ReproducibleFloatingAccumulator - tag_fallback_invoke(sequential_reduce_deterministic_rfa_t, - ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r) + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, + std::size_t partition_size, T init, std::false_type&&) { - hpx::parallel::detail::rfa::RFA_bins bins; + hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); - std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); - hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - auto e = first; - for (std::size_t i = 0; i < size; ++i, ++e) + T rfa; + rfa += init; + std::size_t partition_size_lim = 0; + for (auto e = first; partition_size_lim <= partition_size; + partition_size_lim++, e++) { - rfa += *e; + rfa += (*e); } return rfa; } - - // template , - // // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< - // // double>>::value> - // > - // friend constexpr T tag_fallback_invoke( - // sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, - // InIterE last, T init, Reduce&& r) - // { - // static_assert(hpx::util::contains, - // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< - // double>>::value); - // hpx::parallel::detail::rfa::RFA_bins bins; - // bins.initialize_bins(); - // std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins)); - - // hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - // rfa.set_max_abs_val(init); - // rfa.unsafe_add(init); - // rfa.renorm(); - // for (auto e = first; e != last; ++e) - // { - // rfa += *e; - // } - // return rfa.conv(); - // } }; #if !defined(HPX_COMPUTE_DEVICE_CODE) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 2bebafc41300..5503a74ce76f 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -401,6 +401,47 @@ namespace hpx::parallel { ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); } + + template + static util::detail::algorithm_result_t parallel( + ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init, + Reduce&& r) + { + (void) r; + if (first == last) + { + return util::detail::algorithm_result::get( + HPX_FORWARD(T_, init)); + } + + auto f1 = [policy](FwdIterB part_begin, std::size_t part_size) + -> hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator { + T_ val = *part_begin; + return hpx::parallel::detail:: + sequential_reduce_deterministic_rfa( + HPX_FORWARD(ExPolicy, policy), ++part_begin, + --part_size, HPX_MOVE(val), + std::true_type{}); + }; + + return util::partitioner>::call(HPX_FORWARD(ExPolicy, policy), first, + detail::distance(first, last), HPX_MOVE(f1), + hpx::unwrapping([policy](auto&& results) -> T_ { + return hpx::parallel::detail:: + sequential_reduce_deterministic_rfa( + HPX_FORWARD(ExPolicy, policy), + hpx::util::begin(results), + hpx::util::size(results), + hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator{}, + std::false_type{}) + .conv(); + })); + } }; /// \endcond } // namespace detail diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp new file mode 100644 index 000000000000..5a267dd6a634 --- /dev/null +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -0,0 +1,159 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include + +#if !defined(HPX_COMPUTE_DEVICE_CODE) +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +int seed = 1000; +std::mt19937 gen(seed); + +template +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) +{ + return LO + + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +void bench_reduce_deterministic(const auto& policy, + const auto& deterministic_shuffled, const auto& val_det, const auto& op) +{ + // check if different type for deterministic and nondeeterministic + // and same result + + auto r1_shuffled = + hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled), + std::end(deterministic_shuffled), val_det, op); + + HPX_UNUSED(r1_shuffled); +} + +void bench_reduce(const auto& policy, const auto& deterministic_shuffled, + const auto& val_det, const auto& op) +{ + auto r = hpx::reduce(policy, (std::begin(deterministic_shuffled)), + (std::end(deterministic_shuffled)), val_det, op); + + HPX_UNUSED(r); +} + +////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + std::srand(seed); + + auto test_count = vm["test_count"].as(); + std::size_t vector_size = vm["vector-size"].as(); + + hpx::util::perftests_init(vm); + + // verify that input is within domain of program + if (test_count == 0 || test_count < 0) + { + std::cerr << "test_count cannot be zero or negative...\n" << std::flush; + hpx::local::finalize(); + return -1; + } + + { + using FloatTypeDeterministic = float; + std::size_t LEN = vector_size; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = get_rand( + -num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = + deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + { + hpx::util::perftests_report("reduce", "seq", test_count, [&]() { + bench_reduce( + hpx::execution::seq, deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report("reduce", "par", test_count, [&]() { + bench_reduce( + hpx::execution::par, deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "reduce deterministic", "seq", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "reduce deterministic", "par", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::par, + deterministic_shuffled, val_det, op); + }); + } + + hpx::util::perftests_print_times(); + } + + return hpx::local::finalize(); +} + +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char* argv[]) +{ + using namespace hpx::program_options; + + options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]"); + + // clang-format off + cmdline.add_options() + ("test_count", value()->default_value(100), + "number of tests to be averaged") + ("vector-size", value()->default_value(1000000), + "number of elements to be reduced") + ; + // clang-format on + + hpx::util::perftests_cfg(cmdline); + hpx::local::init_params init_args; + init_args.desc_cmdline = cmdline; + init_args.cfg = {"hpx.os_threads=all"}; + + return hpx::local::init(hpx_main, argc, argv, init_args); +} +#endif From b2ab51bf9fd9f975864adbdbb30937cb7752a59f Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sat, 1 Feb 2025 19:27:52 -0600 Subject: [PATCH 08/19] dbg: a negative value seeps in Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 33 ++++++++++--------- .../algorithms/reduce_deterministic.hpp | 25 +++++++++++--- .../unit/algorithms/reduce_deterministic.cpp | 33 ++++++++++--------- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index ef0ac5c93a44..dc3ee96d6d51 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -36,9 +36,9 @@ namespace hpx::parallel::detail { { /// TODO: Put constraint on Reduce to be a binary plus operator (void) r; - hpx::parallel::detail::rfa::RFA_bins bins; - bins.initialize_bins(); - std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + // hpx::parallel::detail::rfa::RFA_bins bins; + // bins.initialize_bins(); + // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; rfa.set_max_abs_val(init); @@ -79,18 +79,17 @@ namespace hpx::parallel::detail { ExPolicy&&, InIterB first, std::size_t partition_size, T init, std::true_type&&) { - hpx::parallel::detail::rfa::RFA_bins bins; - bins.initialize_bins(); - std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + // hpx::parallel::detail::rfa::RFA_bins bins; + // bins.initialize_bins(); + // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; - rfa.set_max_abs_val(init); - rfa.unsafe_add(init); - rfa.renorm(); + rfa.zero(); + rfa += init; size_t count = 0; T max_val = std::abs(*first); std::size_t partition_size_lim = 0; - for (auto e = first; partition_size_lim <= partition_size; + for (auto e = first; partition_size_lim < partition_size; partition_size_lim++, e++) { T temp_max_val = std::abs(static_cast(*e)); @@ -107,7 +106,8 @@ namespace hpx::parallel::detail { count = 0; } } - return rfa; + printf("rfa res conv: %f\n", rfa.conv()); + return std::move(rfa); } template @@ -115,17 +115,20 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, std::size_t partition_size, T init, std::false_type&&) { - hpx::parallel::detail::rfa::RFA_bins bins; - bins.initialize_bins(); - std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + // hpx::parallel::detail::rfa::RFA_bins bins; + // bins.initialize_bins(); + // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); T rfa; + rfa.zero(); rfa += init; std::size_t partition_size_lim = 0; - for (auto e = first; partition_size_lim <= partition_size; + for (auto e = first; partition_size_lim < partition_size; partition_size_lim++, e++) { + printf("rfa: %f rfa val before:%f\n", (*e).conv(), rfa.conv()); rfa += (*e); + printf("rfa: %f rfa val:%f\n", (*e).conv(), rfa.conv()); } return rfa; } diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 5503a74ce76f..c508403e44eb 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -397,6 +397,10 @@ namespace hpx::parallel { static constexpr T sequential(ExPolicy&& policy, InIterB first, InIterE last, T_&& init, Reduce&& r) { + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__, + &bins, sizeof(bins)); return hpx::parallel::detail::sequential_reduce_deterministic< ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); @@ -415,6 +419,11 @@ namespace hpx::parallel { HPX_FORWARD(T_, init)); } + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__, + &bins, sizeof(bins)); + auto f1 = [policy](FwdIterB part_begin, std::size_t part_size) -> hpx::parallel::detail::rfa:: ReproducibleFloatingAccumulator { @@ -430,14 +439,22 @@ namespace hpx::parallel { hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator< T_>>::call(HPX_FORWARD(ExPolicy, policy), first, detail::distance(first, last), HPX_MOVE(f1), - hpx::unwrapping([policy](auto&& results) -> T_ { + hpx::unwrapping([policy, init](auto&& results) -> T_ { + // Assumed that + hpx::parallel::detail::rfa:: + ReproducibleFloatingAccumulator + rfa; + rfa.zero(); + rfa += init; + for (auto e : results) + { + printf("rfa results %f\n", e.conv()); + } return hpx::parallel::detail:: sequential_reduce_deterministic_rfa( HPX_FORWARD(ExPolicy, policy), hpx::util::begin(results), - hpx::util::size(results), - hpx::parallel::detail::rfa:: - ReproducibleFloatingAccumulator{}, + hpx::util::size(results), HPX_MOVE(rfa), std::false_type{}) .conv(); })); diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 9ad604be2611..01defb167887 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -87,7 +87,8 @@ void test_reduce_parallel1(IteratorTag) { // check if different type for deterministic and nondeeterministic // and same result i.e. correct computation - using base_iterator_det = std::vector::iterator; + using base_iterator_det = + typename std::vector::iterator; using iterator_det = test::test_iterator; using base_iterator_ndet = @@ -109,20 +110,20 @@ void test_reduce_parallel1(IteratorTag) return v1 + v2; }; - FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par, - iterator_det(std::begin(deterministic)), + FloatTypeDeterministic r1 = hpx::experimental::reduce_deterministic( + hpx::execution::par, iterator_det(std::begin(deterministic)), iterator_det(std::end(deterministic)), val_det, op); // verify values - // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par, - // iterator_ndet(std::begin(nondeterministic)), - // iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); HPX_TEST_EQ(r1, r3); - // HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(r2, r3); } template (IteratorTag()); - test_reduce1(IteratorTag()); - test_reduce1(IteratorTag()); - test_reduce1(IteratorTag()); + // test_reduce1(IteratorTag()); + // test_reduce1(IteratorTag()); + // test_reduce1(IteratorTag()); + // test_reduce1(IteratorTag()); test_reduce_parallel1(IteratorTag()); } @@ -238,16 +239,16 @@ void test_reduce2() { using namespace hpx::execution; - test_reduce_determinism(IteratorTag()); - test_reduce_determinism(IteratorTag()); + // test_reduce_determinism(IteratorTag()); + // test_reduce_determinism(IteratorTag()); } void reduce_test1() { test_reduce1(); - test_reduce2(); - test_reduce1(); - test_reduce2(); + // test_reduce2(); + // test_reduce1(); + // test_reduce2(); } /////////////////////////////////////////////////////////////////////////////// From 1250ec86b5862f08eaf585183c52a4381b98427e Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 13:12:34 -0600 Subject: [PATCH 09/19] fix: when comparing the max_val it does not update to accommodate a value equal to itself Signed-off-by: Shreyas Atre --- .../detail/reduce_deterministic.hpp | 25 ++++++++----------- .../algorithms/reduce_deterministic.hpp | 9 +++---- .../unit/algorithms/reduce_deterministic.cpp | 18 ++++++------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index dc3ee96d6d51..6a4f379c7d7c 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -36,16 +36,16 @@ namespace hpx::parallel::detail { { /// TODO: Put constraint on Reduce to be a binary plus operator (void) r; - // hpx::parallel::detail::rfa::RFA_bins bins; - // bins.initialize_bins(); - // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + // __rfa_bin_host_buffer__ should be initialized by the frontend of + // this method hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; rfa.set_max_abs_val(init); rfa.unsafe_add(init); rfa.renorm(); size_t count = 0; - T max_val = std::abs(*first); + T max_val = std::abs(std::numeric_limits::min()); for (auto e = first; e != last; ++e) { T temp_max_val = std::abs(static_cast(*e)); @@ -79,15 +79,14 @@ namespace hpx::parallel::detail { ExPolicy&&, InIterB first, std::size_t partition_size, T init, std::true_type&&) { - // hpx::parallel::detail::rfa::RFA_bins bins; - // bins.initialize_bins(); - // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + // __rfa_bin_host_buffer__ should be initialized by the frontend of + // this method hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; rfa.zero(); rfa += init; size_t count = 0; - T max_val = std::abs(*first); + T max_val = std::abs(std::numeric_limits::min()); std::size_t partition_size_lim = 0; for (auto e = first; partition_size_lim < partition_size; partition_size_lim++, e++) @@ -106,8 +105,7 @@ namespace hpx::parallel::detail { count = 0; } } - printf("rfa res conv: %f\n", rfa.conv()); - return std::move(rfa); + return rfa; } template @@ -115,9 +113,8 @@ namespace hpx::parallel::detail { sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first, std::size_t partition_size, T init, std::false_type&&) { - // hpx::parallel::detail::rfa::RFA_bins bins; - // bins.initialize_bins(); - // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + // __rfa_bin_host_buffer__ should be initialized by the frontend of + // this method T rfa; rfa.zero(); @@ -126,9 +123,7 @@ namespace hpx::parallel::detail { for (auto e = first; partition_size_lim < partition_size; partition_size_lim++, e++) { - printf("rfa: %f rfa val before:%f\n", (*e).conv(), rfa.conv()); rfa += (*e); - printf("rfa: %f rfa val:%f\n", (*e).conv(), rfa.conv()); } return rfa; } diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index c508403e44eb..2427338d2120 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -397,6 +397,7 @@ namespace hpx::parallel { static constexpr T sequential(ExPolicy&& policy, InIterB first, InIterE last, T_&& init, Reduce&& r) { + // TODO: abstract initializing memory hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__, @@ -419,6 +420,7 @@ namespace hpx::parallel { HPX_FORWARD(T_, init)); } + // TODO: abstract initializing memory hpx::parallel::detail::rfa::RFA_bins bins; bins.initialize_bins(); std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__, @@ -428,6 +430,7 @@ namespace hpx::parallel { -> hpx::parallel::detail::rfa:: ReproducibleFloatingAccumulator { T_ val = *part_begin; + // Assumed that __rfa_bin_host_buffer__ is initiallized return hpx::parallel::detail:: sequential_reduce_deterministic_rfa( HPX_FORWARD(ExPolicy, policy), ++part_begin, @@ -440,16 +443,12 @@ namespace hpx::parallel { T_>>::call(HPX_FORWARD(ExPolicy, policy), first, detail::distance(first, last), HPX_MOVE(f1), hpx::unwrapping([policy, init](auto&& results) -> T_ { - // Assumed that + // Assumed that __rfa_bin_host_buffer__ is initiallized hpx::parallel::detail::rfa:: ReproducibleFloatingAccumulator rfa; rfa.zero(); rfa += init; - for (auto e : results) - { - printf("rfa results %f\n", e.conv()); - } return hpx::parallel::detail:: sequential_reduce_deterministic_rfa( HPX_FORWARD(ExPolicy, policy), diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 01defb167887..6fada2ab597a 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -227,10 +227,10 @@ void test_reduce1() { using namespace hpx::execution; - // test_reduce1(IteratorTag()); - // test_reduce1(IteratorTag()); - // test_reduce1(IteratorTag()); - // test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); test_reduce_parallel1(IteratorTag()); } @@ -239,16 +239,16 @@ void test_reduce2() { using namespace hpx::execution; - // test_reduce_determinism(IteratorTag()); - // test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); } void reduce_test1() { test_reduce1(); - // test_reduce2(); - // test_reduce1(); - // test_reduce2(); + test_reduce2(); + test_reduce1(); + test_reduce2(); } /////////////////////////////////////////////////////////////////////////////// From 2e8eea15aaabb96d840a2f841fd29aa83b9b4039 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 13:51:03 -0600 Subject: [PATCH 10/19] Add benchmark entry to CMakeLists Signed-off-by: Shreyas Atre --- .../tests/performance/CMakeLists.txt | 1 + .../benchmark_reduce_deterministic.cpp | 23 +++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/libs/core/algorithms/tests/performance/CMakeLists.txt b/libs/core/algorithms/tests/performance/CMakeLists.txt index d74788a9b47f..96ce826dc742 100644 --- a/libs/core/algorithms/tests/performance/CMakeLists.txt +++ b/libs/core/algorithms/tests/performance/CMakeLists.txt @@ -16,6 +16,7 @@ set(benchmarks benchmark_partial_sort_parallel benchmark_partition benchmark_partition_copy + benchmark_reduce_deterministic benchmark_remove benchmark_remove_if benchmark_scan_algorithms diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp index 5a267dd6a634..c6604aaea44d 100644 --- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -33,25 +33,28 @@ T get_rand(T LO = (std::numeric_limits::min)(), } /////////////////////////////////////////////////////////////////////////////// - -void bench_reduce_deterministic(const auto& policy, - const auto& deterministic_shuffled, const auto& val_det, const auto& op) +template +void bench_reduce_deterministic(const PolicyT& policy, + const IteratorT& deterministic_shuffled, const InitVal& val_det, + const Op& op) { // check if different type for deterministic and nondeeterministic // and same result - auto r1_shuffled = - hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled), - std::end(deterministic_shuffled), val_det, op); + auto r1_shuffled = hpx::experimental::reduce_deterministic(policy, + std::begin(deterministic_shuffled), std::end(deterministic_shuffled), + val_det, op); HPX_UNUSED(r1_shuffled); } -void bench_reduce(const auto& policy, const auto& deterministic_shuffled, - const auto& val_det, const auto& op) +template +void bench_reduce(const PolicyT& policy, + const IteratorT& non_deterministic_shuffled, const InitVal& val_det, + const Op& op) { - auto r = hpx::reduce(policy, (std::begin(deterministic_shuffled)), - (std::end(deterministic_shuffled)), val_det, op); + auto r = hpx::reduce(policy, (std::begin(non_deterministic_shuffled)), + (std::end(non_deterministic_shuffled)), val_det, op); HPX_UNUSED(r); } From bedbf56c36b32ef2598e1a003ec89497c90f7306 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 14:01:15 -0600 Subject: [PATCH 11/19] fix: missing braces around initializer Signed-off-by: Shreyas Atre --- .../algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp index b8f5da5f233a..77e43fd5894b 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -824,7 +824,7 @@ namespace hpx::parallel::detail::rfa { ///Set the binned fp to zero void zero() { - data = {0}; + data = {{0}}; } ///Return the fold of the binned fp From ca73d3d351bd794fde7b2c398b3d66e8bf84e004 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 16:02:17 -0600 Subject: [PATCH 12/19] Remove unnecessary include Signed-off-by: Shreyas Atre --- .../include/hpx/parallel/algorithms/reduce_deterministic.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 2427338d2120..56135161495f 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -10,7 +10,6 @@ #pragma once -#include "detail/reduce_deterministic.hpp" #if defined(DOXYGEN) namespace hpx { From aba736e03edb39d747bec839904e40b86383debd Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 16:04:02 -0600 Subject: [PATCH 13/19] Remove sanitizer added for testing purpose Signed-off-by: Shreyas Atre --- libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 76dc5fcd9806..559ee830030e 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -246,7 +246,3 @@ foreach(test ${tests}) "modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS} ) endforeach() - -target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address) - -target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address) \ No newline at end of file From 6011678d0335649e79f53b0d20c74ec3b3d5ab8c Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 16:06:22 -0600 Subject: [PATCH 14/19] Remove unnecessary overload added Signed-off-by: Shreyas Atre --- libs/core/debugging/src/print.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp index 8a01d9574853..3d7cf5da2aa0 100644 --- a/libs/core/debugging/src/print.cpp +++ b/libs/core/debugging/src/print.cpp @@ -57,10 +57,6 @@ namespace hpx::debug { std::ostream&, std::int32_t const&, int); template HPX_CORE_EXPORT void print_dec( std::ostream&, std::int64_t const&, int); -#ifdef __APPLE__ - template HPX_CORE_EXPORT void print_dec( - std::ostream&, unsigned long const&, int); -#endif template HPX_CORE_EXPORT void print_dec( std::ostream&, std::uint64_t const&, int); From 21339d5d3f72f2df37d30974fcba00b61caa75f9 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 16:06:22 -0600 Subject: [PATCH 15/19] Remove unnecessary type cast and overload Signed-off-by: Shreyas Atre --- libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp index d86fc5775415..b29652a3ede1 100644 --- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp +++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp @@ -25,7 +25,7 @@ void tagged_ptr_test() i = j; HPX_TEST_EQ(i.get_ptr(), &b); - HPX_TEST_EQ(i.get_tag(), 1UL); + HPX_TEST_EQ(i.get_tag(), 1); } { @@ -43,7 +43,7 @@ void tagged_ptr_test() { tagged_ptr j(&a, max_tag); - HPX_TEST_EQ(j.get_next_tag(), 0UL); + HPX_TEST_EQ(j.get_next_tag(), 0); } { From 5af23aec6f23a636be62922e59da9807444c2729 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Sun, 2 Feb 2025 16:14:28 -0600 Subject: [PATCH 16/19] Address clang-format modifications Signed-off-by: Shreyas Atre --- .../hpx/parallel/algorithms/detail/reduce_deterministic.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 6a4f379c7d7c..3fc0ec2e4344 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -89,7 +89,7 @@ namespace hpx::parallel::detail { T max_val = std::abs(std::numeric_limits::min()); std::size_t partition_size_lim = 0; for (auto e = first; partition_size_lim < partition_size; - partition_size_lim++, e++) + partition_size_lim++, e++) { T temp_max_val = std::abs(static_cast(*e)); if (max_val < temp_max_val) @@ -121,7 +121,7 @@ namespace hpx::parallel::detail { rfa += init; std::size_t partition_size_lim = 0; for (auto e = first; partition_size_lim < partition_size; - partition_size_lim++, e++) + partition_size_lim++, e++) { rfa += (*e); } From 1c85d5d6a2f2a3ce5a2837d587478fff5bf83fec Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 4 Feb 2025 21:56:30 -0600 Subject: [PATCH 17/19] Address inspect tool Signed-off-by: Shreyas Atre --- .../hpx/parallel/algorithms/detail/reduce_deterministic.hpp | 4 ++-- .../include/hpx/parallel/algorithms/reduce_deterministic.hpp | 1 + .../tests/performance/benchmark_reduce_deterministic.cpp | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp index 3fc0ec2e4344..2c2aeabf4ce6 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -45,7 +45,7 @@ namespace hpx::parallel::detail { rfa.unsafe_add(init); rfa.renorm(); size_t count = 0; - T max_val = std::abs(std::numeric_limits::min()); + T max_val = std::abs((std::numeric_limits::min)()); for (auto e = first; e != last; ++e) { T temp_max_val = std::abs(static_cast(*e)); @@ -86,7 +86,7 @@ namespace hpx::parallel::detail { rfa.zero(); rfa += init; size_t count = 0; - T max_val = std::abs(std::numeric_limits::min()); + T max_val = std::abs((std::numeric_limits::min)()); std::size_t partition_size_lim = 0; for (auto e = first; partition_size_lim < partition_size; partition_size_lim++, e++) diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp index 56135161495f..fddfd2bff1a3 100644 --- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -371,6 +371,7 @@ namespace hpx { #include #include +#include #include #include #include diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp index c6604aaea44d..a91387234830 100644 --- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include From 04af5ec0994c64e785a9c29211895a771c8189c2 Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 4 Feb 2025 23:36:39 -0600 Subject: [PATCH 18/19] Add double and float separate benchmarks, more tests Signed-off-by: Shreyas Atre --- .../benchmark_reduce_deterministic.cpp | 155 ++++++++++++------ .../unit/algorithms/reduce_deterministic.cpp | 7 +- 2 files changed, 113 insertions(+), 49 deletions(-) diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp index a91387234830..f2315f988750 100644 --- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp @@ -79,56 +79,117 @@ int hpx_main(hpx::program_options::variables_map& vm) } { - using FloatTypeDeterministic = float; - std::size_t LEN = vector_size; - - constexpr FloatTypeDeterministic num_bounds_det = - std::is_same_v ? 1000.0 : 1000000.0; - - std::vector deterministic(LEN); - - for (size_t i = 0; i < LEN; ++i) - { - deterministic[i] = get_rand( - -num_bounds_det, num_bounds_det); - } - - std::vector deterministic_shuffled = - deterministic; - - std::shuffle( - deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); - - FloatTypeDeterministic val_det(41.999); - - auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { - return v1 + v2; - }; - { - hpx::util::perftests_report("reduce", "seq", test_count, [&]() { - bench_reduce( - hpx::execution::seq, deterministic_shuffled, val_det, op); - }); - } - { - hpx::util::perftests_report("reduce", "par", test_count, [&]() { - bench_reduce( - hpx::execution::par, deterministic_shuffled, val_det, op); - }); - } { - hpx::util::perftests_report( - "reduce deterministic", "seq", test_count, [&]() { - bench_reduce_deterministic(hpx::execution::seq, - deterministic_shuffled, val_det, op); - }); + using FloatTypeDeterministic = float; + std::size_t LEN = vector_size; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : + 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = get_rand( + -num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = + deterministic; + + std::shuffle(deterministic_shuffled.begin(), + deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + { + hpx::util::perftests_report( + "fl reduce", "seq", test_count, [&]() { + bench_reduce(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "fl reduce", "par", test_count, [&]() { + bench_reduce(hpx::execution::par, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "fl reduce deterministic", "seq", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "fl reduce deterministic", "par", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::par, + deterministic_shuffled, val_det, op); + }); + } } { - hpx::util::perftests_report( - "reduce deterministic", "par", test_count, [&]() { - bench_reduce_deterministic(hpx::execution::par, - deterministic_shuffled, val_det, op); - }); + using FloatTypeDeterministic = double; + std::size_t LEN = vector_size; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : + 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = get_rand( + -num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = + deterministic; + + std::shuffle(deterministic_shuffled.begin(), + deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + { + hpx::util::perftests_report( + "dbl reduce", "seq", test_count, [&]() { + bench_reduce(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "dbl reduce", "par", test_count, [&]() { + bench_reduce(hpx::execution::par, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "dbl reduce deterministic", "seq", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::seq, + deterministic_shuffled, val_det, op); + }); + } + { + hpx::util::perftests_report( + "dbl reduce deterministic", "par", test_count, [&]() { + bench_reduce_deterministic(hpx::execution::par, + deterministic_shuffled, val_det, op); + }); + } } hpx::util::perftests_print_times(); diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp index 6fada2ab597a..5dcf41f4d9c5 100644 --- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -122,8 +122,8 @@ void test_reduce_parallel1(IteratorTag) FloatTypeNonDeterministic r3 = std::accumulate( nondeterministic.begin(), nondeterministic.end(), val_non_det); - HPX_TEST_EQ(r1, r3); - HPX_TEST_EQ(r2, r3); + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); } template (IteratorTag()); test_reduce1(IteratorTag()); test_reduce_parallel1(IteratorTag()); + test_reduce_parallel1(IteratorTag()); + test_reduce_parallel1(IteratorTag()); + test_reduce_parallel1(IteratorTag()); } template From 2319abe9915a4f151f11806ec0863ff5f47eee6b Mon Sep 17 00:00:00 2001 From: Shreyas Atre Date: Tue, 11 Feb 2025 19:23:59 -0600 Subject: [PATCH 19/19] Fix macos CI Signed-off-by: Shreyas Atre --- .github/workflows/macos_debug_fetch_hwloc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index c1a54b4186e4..7b0044941751 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -36,6 +36,7 @@ jobs: -DHPX_WITH_VERIFY_LOCKS=ON \ -DHPX_WITH_VERIFY_LOCKS_BACKTRACE=ON \ -DHPX_WITH_CHECK_MODULE_DEPENDENCIES=ON + ln -s "$(which aclocal)" /opt/homebrew/bin/aclocal-1.16 cd build/_deps/hwloc-src/ && autoreconf -f -i - name: Build shell: bash