From 77e6ddfc0050f501c5635ebb86d3db504ba40207 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 23 Jan 2024 11:30:59 -0600
Subject: [PATCH 01/19] [macOS] Comparison between exactly same types

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
index b29652a3ede1..d86fc5775415 100644
--- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp
+++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
@@ -25,7 +25,7 @@ void tagged_ptr_test()
         i = j;
 
         HPX_TEST_EQ(i.get_ptr(), &b);
-        HPX_TEST_EQ(i.get_tag(), 1);
+        HPX_TEST_EQ(i.get_tag(), 1UL);
     }
 
     {
@@ -43,7 +43,7 @@ void tagged_ptr_test()
 
     {
         tagged_ptr<int> j(&a, max_tag);
-        HPX_TEST_EQ(j.get_next_tag(), 0);
+        HPX_TEST_EQ(j.get_next_tag(), 0UL);
     }
 
     {

From 0a0fec58616db5e149b082689dd65ce2e813c078 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 23 Jan 2024 11:32:20 -0600
Subject: [PATCH 02/19] [macOS] Apple does not seem to have any typedef for
 unsigned long int

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/debugging/src/print.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp
index 3d7cf5da2aa0..8a01d9574853 100644
--- a/libs/core/debugging/src/print.cpp
+++ b/libs/core/debugging/src/print.cpp
@@ -57,6 +57,10 @@ namespace hpx::debug {
             std::ostream&, std::int32_t const&, int);
         template HPX_CORE_EXPORT void print_dec(
             std::ostream&, std::int64_t const&, int);
+#ifdef __APPLE__
+        template HPX_CORE_EXPORT void print_dec(
+            std::ostream&, unsigned long const&, int);
+#endif
         template HPX_CORE_EXPORT void print_dec(
             std::ostream&, std::uint64_t const&, int);
 

From b68fc468d4019ac12cd5d5a60dc3ff18a8099db0 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Wed, 4 Dec 2024 00:51:55 -0600
Subject: [PATCH 03/19] Fix some issues and add tests for determinism

- Add Kate's CUDA impl for RFA

TODO:
- Use original RFA instead of Kate's
- Make a parallel version out of it
- Make a partition vector suitable version

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           |    2 +-
 .../hpx/parallel/algorithms/detail/rfa.hpp    |    1 -
 .../parallel/algorithms/detail/rfa_cuda.hpp   | 1168 +++++++++++++++++
 .../tests/unit/algorithms/CMakeLists.txt      |    4 +
 .../unit/algorithms/reduce_deterministic.cpp  |    1 -
 5 files changed, 1173 insertions(+), 3 deletions(-)
 create mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index b37730889172..7e29adce97cf 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -9,7 +9,7 @@
 #include <hpx/config.hpp>
 #include <hpx/functional/detail/tag_fallback_invoke.hpp>
 #include <hpx/functional/invoke.hpp>
-#include <hpx/parallel/algorithms/detail/rfa.hpp>
+#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index 2c72f10bfb91..b8f5da5f233a 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -245,7 +245,6 @@ namespace hpx::parallel::detail::rfa {
         ///The number of deposits that can be performed before a renorm is necessary.
         ///Applies also to binned complex double precision.
         static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-
         ///Return a binned floating-point reference bin
         inline const ftype* binned_bins(const int x) const
         {
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
new file mode 100644
index 000000000000..05f71d9ae746
--- /dev/null
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
@@ -0,0 +1,1168 @@
+//Reproducible Floating Point Accumulations via Binned Floating Point
+//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
+//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
+//
+//The code accomplishes several objectives:
+//
+//1. Reproducible summation, independent of summation order, assuming only a
+//   subset of the IEEE 754 Floating Point Standard
+//
+//2. Has accuracy at least as good as conventional summation, and tunable
+//
+//3. Handles overflow, underflow, and other exceptions reproducibly.
+//
+//4. Makes only one read-only pass over the summands.
+//
+//5. Requires only one parallel reduction.
+//
+//6. Uses minimal memory (6 doubles per accumulator with fold=3).
+//
+//7. Relatively easy to use
+
+#pragma once
+
+#include <algorithm>
+#include <climits>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#ifndef __CUDACC__
+#define __host__
+#define __device__
+#define __forceinline__
+#include <array>
+using std::array;
+using std::max;
+using std::min;
+#else
+#include <cuda/std/array>
+using cuda::std::array;
+using cuda::std::max;
+using cuda::std::min;
+#include "vector.hpp"
+#endif
+
+namespace hpx::parallel::detail::rfa {
+    template <typename F>
+    struct type4
+    {
+        F x;
+        F y;
+        F z;
+        F w;
+    };
+
+    template <typename F>
+    struct type2
+    {
+        F x;
+        F y;
+    };
+    using float4 = type4<float>;
+    using double4 = type4<double>;
+    using float2 = type2<float>;
+    using double2 = type2<double>;
+
+    ///Largest magnitude among the four components of @p a.
+    ///inline: this header may be included by more than one translation unit.
+    inline float abs_max(float4 a)
+    {
+        //Braced-list max scans left to right exactly like std::max_element,
+        //with no heap-allocated std::vector; parentheses dodge a max() macro.
+        return (std::max)(
+            {std::abs(a.x), std::abs(a.y), std::abs(a.z), std::abs(a.w)});
+    }
+
+    ///Largest magnitude among the four components of @p a.
+    ///inline: this header may be included by more than one translation unit.
+    inline double abs_max(double4 a)
+    {
+        //Braced-list max scans left to right exactly like std::max_element,
+        //with no heap-allocated std::vector; parentheses dodge a max() macro.
+        return (std::max)(
+            {std::abs(a.x), std::abs(a.y), std::abs(a.z), std::abs(a.w)});
+    }
+
+    ///Larger magnitude of the two components of @p a.
+    ///inline: this header may be included by more than one translation unit.
+    inline float abs_max(float2 a)
+    {
+        //max(x, y) yields y only when x < y, as the max_element scan did
+        return (std::max)(std::abs(a.x), std::abs(a.y));
+    }
+
+    ///Larger magnitude of the two components of @p a.
+    ///inline: this header may be included by more than one translation unit.
+    inline double abs_max(double2 a)
+    {
+        //max(x, y) yields y only when x < y, as the max_element scan did
+        return (std::max)(std::abs(a.x), std::abs(a.y));
+    }
+
+// disable zero checks
+#define DISABLE_ZERO
+
+// disable nan / infinity checks
+#define DISABLE_NANINF
+
+// jump table for indexing into data
+#define MAX_JUMP 5
+    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
+
+    template <typename Real>
+    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
+    {
+        return std::ldexp(arg, exp);
+        // while (arg == 0)
+        // {
+        //     return arg;
+        // }
+        // while (exp > 0)
+        // {
+        //     arg *= static_cast<Real>(2);
+        //     --exp;
+        // }
+        // while (exp < 0)
+        // {
+        //     arg /= static_cast<Real>(2);
+        //     ++exp;
+        // }
+
+        // return arg;
+    }
+
+    template <class ftype>
+    struct RFA_bins
+    {
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+
+        ///The binned floating-point reference bins
+        array<ftype, MAXINDEX + MAXFOLD> bins = {};
+
+        constexpr ftype& operator[](int d)
+        {
+            return bins[d];
+        }
+
+        void initialize_bins()
+        {
+            if constexpr (std::is_same_v<ftype, float>)
+            {
+                bins[0] = std::ldexp(0.75, MAX_EXP);
+            }
+            else
+            {
+                bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1);
+            }
+
+            for (int index = 1; index <= MAXINDEX; index++)
+            {
+                bins[index] = ldexp(0.75,
+                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
+            }
+            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
+            {
+                bins[index] = bins[index - 1];
+            }
+        }
+    };
+
+    static char bin_host_buffer[sizeof(RFA_bins<double>)];
+#ifdef __CUDACC__
+    __constant__ static char bin_device_buffer[sizeof(RFA_bins<double>)];
+#endif
+
+    ///Class to hold a reproducible summation of the numbers passed to it
+    ///
+    ///@param ftype Floating-point data type; either `float` or `double`
+    ///@param FOLD  The fold; use 3 as a default unless you understand it.
+    template <class ftype_, int FOLD_ = 3,
+        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
+            nullptr>
+    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
+    {
+    public:
+        using ftype = ftype_;
+        static constexpr int FOLD = FOLD_;
+
+    private:
+        array<ftype, 2 * FOLD> data = {0};
+
+        ///Floating-point precision bin width
+        static constexpr auto BIN_WIDTH =
+            std::is_same_v<ftype, double> ? 40 : 13;
+        static constexpr auto MIN_EXP =
+            std::numeric_limits<ftype>::min_exponent;
+        static constexpr auto MAX_EXP =
+            std::numeric_limits<ftype>::max_exponent;
+        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
+        ///Binned floating-point maximum index
+        static constexpr auto MAXINDEX =
+            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
+        //The maximum floating-point fold supported by the library
+        static constexpr auto MAXFOLD = MAXINDEX + 1;
+        ///Binned floating-point compression factor
+        ///This factor is used to scale down inputs before deposition into the bin of
+        ///highest index
+        static constexpr auto COMPRESSION =
+            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
+        ///Binned double precision expansion factor
+        ///This factor is used to scale up inputs after deposition into the bin of
+        ///highest index
+        static constexpr auto EXPANSION =
+            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
+        static constexpr auto EXP_BIAS = MAX_EXP - 2;
+        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
+        ///Binned floating-point deposit endurance
+        ///The number of deposits that can be performed before a renorm is necessary.
+        ///Applies also to binned complex double precision.
+        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
+
+        ///Return a binned floating-point reference bin
+        inline const ftype* binned_bins(const int x) const
+        {
+#ifdef __CUDA_ARCH__    // must be arch not CC here
+            return &reinterpret_cast<RFA_bins<ftype>&>(bin_device_buffer)[x];
+#else
+            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
+#endif
+        }
+
+        ///Get the bit representation of a float
+        static inline uint32_t& get_bits(float& x)
+        {
+            return *reinterpret_cast<uint32_t*>(&x);
+        }
+        ///Get the bit representation of a double
+        static inline uint64_t& get_bits(double& x)
+        {
+            return *reinterpret_cast<uint64_t*>(&x);
+        }
+        ///Get the bit representation of a const float
+        static inline uint32_t get_bits(const float& x)
+        {
+            return *reinterpret_cast<const uint32_t*>(&x);
+        }
+        ///Get the bit representation of a const double
+        static inline uint64_t get_bits(const double& x)
+        {
+            return *reinterpret_cast<const uint64_t*>(&x);
+        }
+
+        ///Return primary vector value const ref
+        inline const ftype& primary(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[4];
+                default:
+                    return data[FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[i];
+            }
+        }
+
+        ///Return carry vector value const ref
+        inline const ftype& carry(int i) const
+        {
+            if constexpr (FOLD <= MAX_JUMP)
+            {
+                switch (i)
+                {
+                case 0:
+                    if constexpr (FOLD >= 1)
+                        return data[FOLD + 0];
+                case 1:
+                    if constexpr (FOLD >= 2)
+                        return data[FOLD + 1];
+                case 2:
+                    if constexpr (FOLD >= 3)
+                        return data[FOLD + 2];
+                case 3:
+                    if constexpr (FOLD >= 4)
+                        return data[FOLD + 3];
+                case 4:
+                    if constexpr (FOLD >= 5)
+                        return data[FOLD + 4];
+                default:
+                    return data[2 * FOLD - 1];
+                }
+            }
+            else
+            {
+                return data[FOLD + i];
+            }
+        }
+
+        ///Return primary vector value ref
+        inline ftype& primary(int i)
+        {
+            const auto& c = *this;
+            return const_cast<ftype&>(c.primary(i));
+        }
+
+        ///Return carry vector value ref
+        inline ftype& carry(int i)
+        {
+            const auto& c = *this;
+            return const_cast<ftype&>(c.carry(i));
+        }
+
+#ifdef DISABLE_ZERO
+        static inline constexpr bool ISZERO(const ftype)
+        {
+            return false;
+        }
+#else
+        static inline constexpr bool ISZERO(const ftype x)
+        {
+            return x == 0.0;
+        }
+#endif
+
+#ifdef DISABLE_NANINF
+        static inline constexpr int ISNANINF(const ftype)
+        {
+            return false;
+        }
+#else
+        static inline constexpr int ISNANINF(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
+                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
+        }
+#endif
+
+        static inline constexpr int EXP(const ftype x)
+        {
+            const auto bits = get_bits(x);
+            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
+        }
+
+        ///Get index of floating-point precision
+        ///The index of a non-binned type is the smallest index a binned type would
+        ///need to have to sum it reproducibly. Higher indices correspond to smaller
+        ///bins.
+        static inline constexpr int binned_dindex(const ftype x)
+        {
+            int exp = EXP(x);
+            if (exp == 0)
+            {
+                if (x == 0.0)
+                {
+                    return MAXINDEX;
+                }
+                else
+                {
+                    frexp(x, &exp);
+                    return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
+                }
+            }
+            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
+        }
+
+        ///Get index of manually specified binned double precision
+        ///The index of a binned type is the bin that it corresponds to. Higher
+        ///indices correspond to smaller bins.
+        inline int binned_index() const
+        {
+            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
+                       EXP(primary(0))) /
+                BIN_WIDTH;
+        }
+
+        ///Check if index of manually specified binned floating-point is 0
+        ///A quick check to determine if the index is 0
+        inline bool binned_index0() const
+        {
+            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
+        }
+
+        ///Update manually specified binned fp with a scalar (X -> Y)
+        ///
+        ///This method updates the binned fp to an index suitable for adding numbers
+        ///with absolute value less than @p max_abs_val
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdupdate(
+            const ftype max_abs_val, const int incpriY, const int inccarY)
+        {
+            if (ISNANINF(primary(0)))
+                return;
+
+            int X_index = binned_dindex(max_abs_val);
+            if (ISZERO(primary(0)))
+            {
+                const ftype* const bins = binned_bins(X_index);
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) = bins[i];
+                    carry(i * inccarY) = 0.0;
+                }
+            }
+            else
+            {
+                int shift = binned_index() - X_index;
+                if (shift > 0)
+                {
+#pragma unroll
+                    for (int i = FOLD - 1; i >= 1; i--)
+                    {
+                        if (i < shift)
+                            break;
+                        primary(i * incpriY) = primary((i - shift) * incpriY);
+                        carry(i * inccarY) = carry((i - shift) * inccarY);
+                    }
+                    const ftype* const bins = binned_bins(X_index);
+#pragma unroll
+                    for (int j = 0; j < FOLD; j++)
+                    {
+                        if (j >= shift)
+                            break;
+                        primary(j * incpriY) = bins[j];
+                        carry(j * inccarY) = 0.0;
+                    }
+                }
+            }
+        }
+
+        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on a binned type Y where the index of Y is
+        ///larger than the index of @p X
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        void binned_dmddeposit(const ftype X, const int incpriY)
+        {
+            ftype M;
+            ftype x = X;
+
+            if (ISNANINF(x) || ISNANINF(primary(0)))
+            {
+                primary(0) += x;
+                return;
+            }
+
+            if (binned_index0())
+            {
+                M = primary(0);
+                ftype qd = x * COMPRESSION;
+                auto& ql = get_bits(qd);
+                ql |= 1;
+                qd += M;
+                primary(0) = qd;
+                M -= qd;
+                M *= EXPANSION * 0.5;
+                x += M;
+                x += M;
+#pragma unroll
+                for (int i = 1; i < FOLD - 1; i++)
+                {
+                    M = primary(i * incpriY);
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    primary(i * incpriY) = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                primary((FOLD - 1) * incpriY) += qd;
+            }
+            else
+            {
+                ftype qd = x;
+                auto& ql = get_bits(qd);
+#pragma unroll
+                for (int i = 0; i < FOLD - 1; i++)
+                {
+                    M = primary(i * incpriY);
+                    qd = x;
+                    ql |= 1;
+                    qd += M;
+                    primary(i * incpriY) = qd;
+                    M -= qd;
+                    x += M;
+                }
+                qd = x;
+                ql |= 1;
+                primary((FOLD - 1) * incpriY) += qd;
+            }
+        }
+
+        ///Renormalize manually specified binned double precision
+        ///
+        ///Renormalization keeps the primary vector within the necessary bins by
+        ///shifting over to the carry vector
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        inline void binned_dmrenorm(const int incpriX, const int inccarX)
+        {
+            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
+                return;
+
+            for (int i = 0; i < FOLD; i++)
+            {
+                auto tmp_renormd = primary(i * incpriX);
+                auto& tmp_renorml = get_bits(tmp_renormd);
+
+                carry(i * inccarX) +=
+                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
+
+                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
+                tmp_renorml |= 1ull << (MANT_DIG - 2);
+                primary(i * incpriX) = tmp_renormd;
+            }
+        }
+
+        ///Add scalar to manually specified binned fp (Y += X)
+        ///
+        ///Performs the operation Y += X on a binned type Y
+        ///
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
+        {
+            binned_dmdupdate(X, incpriY, inccarY);
+            binned_dmddeposit(X, incpriY);
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Convert manually specified binned fp to native double-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        double binned_conv_double(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
+                return 0.0;
+
+            double Y = 0.0;
+            double scale_down;
+            double scale_up;
+            int scaled;
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
+            {
+                scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
+                scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
+                scaled =
+                    max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
+                if (X_index == 0)
+                {
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
+                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
+                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
+                    i = 2;
+                }
+                else
+                {
+                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
+                    i = 1;
+                }
+                for (; i < scaled; i++)
+                {
+                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
+                    Y +=
+                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
+                }
+                if (i == FOLD)
+                {
+                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
+                        scale_down;
+                    return Y * scale_up;
+                }
+                if (std::isinf(Y * scale_up))
+                {
+                    return Y * scale_up;
+                }
+                Y *= scale_up;
+                for (; i < FOLD; i++)
+                {
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += primary((i - 1) * incpriX) - bins[i - 1];
+                }
+                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
+            }
+            else
+            {
+                Y += carry(0) * (bins[0] / 6.0);
+                for (i = 1; i < FOLD; i++)
+                {
+                    Y += carry(i * inccarX) * (bins[i] / 6.0);
+                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
+                }
+                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+            }
+            return Y;
+        }
+
+        ///Convert manually specified binned fp to native single-precision (X -> Y)
+        ///
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        float binned_conv_single(const int incpriX, const int inccarX) const
+        {
+            int i = 0;
+            double Y = 0.0;
+
+            if (ISNANINF(primary(0)))
+                return primary(0);
+            if (ISZERO(primary(0)))
+                return 0.0;
+
+            //Note that the following order of summation is in order of decreasing
+            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
+            //the number of carries equal to 1.
+            const auto X_index = binned_index();
+            const auto* const bins = binned_bins(X_index);
+            if (X_index == 0)
+            {
+                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
+                    (double) EXPANSION;
+                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
+                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
+                i = 2;
+            }
+            else
+            {
+                Y += (double) carry(0) * (double) (bins[0] / 6.0);
+                i = 1;
+            }
+            for (; i < FOLD; i++)
+            {
+                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
+                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
+            }
+            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
+
+            return (float) Y;
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        ///
+        ///@param x       Another binned fp of the same type
+        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
+        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
+        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
+        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
+        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
+            const int incpriX, const int inccarX, const int incpriY,
+            const int inccarY)
+        {
+            if (ISZERO(x.primary(0)))
+                return;
+
+            if (ISZERO(primary(0)))
+            {
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
+                }
+                return;
+            }
+
+            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
+            {
+                primary(0) += x.primary(0);
+                return;
+            }
+
+            const auto X_index = x.binned_index();
+            const auto Y_index = this->binned_index();
+            const auto shift = Y_index - X_index;
+            if (shift > 0)
+            {
+                const auto* const bins = binned_bins(Y_index);
+                //shift Y upwards and add X to Y
+#pragma unroll
+                for (int i = FOLD - 1; i >= 1; i--)
+                {
+                    if (i < shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX) +
+                        (primary((i - shift) * incpriY) - bins[i - shift]);
+                    carry(i * inccarY) =
+                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
+                }
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    if (i == shift)
+                        break;
+                    primary(i * incpriY) = x.primary(i * incpriX);
+                    carry(i * inccarY) = x.carry(i * inccarX);
+                }
+            }
+            else if (shift < 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                //shift X upwards and add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    if (i < -shift)
+                        continue;
+                    primary(i * incpriY) +=
+                        x.primary((i + shift) * incpriX) - bins[i + shift];
+                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
+                }
+            }
+            else if (shift == 0)
+            {
+                const auto* const bins = binned_bins(X_index);
+                // add X to Y
+#pragma unroll
+                for (int i = 0; i < FOLD; i++)
+                {
+                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
+                    carry(i * inccarY) += x.carry(i * inccarX);
+                }
+            }
+
+            binned_dmrenorm(incpriY, inccarY);
+        }
+
+        ///Add two manually specified binned fp (Y += X)
+        ///Performs the operation Y += X
+        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dmdmadd(other, 1, 1, 1, 1);
+        }
+
+    public:
+        ReproducibleFloatingAccumulator() = default;
+        ReproducibleFloatingAccumulator(
+            const ReproducibleFloatingAccumulator&) = default;
+        ///Sets this binned fp equal to another binned fp
+        ReproducibleFloatingAccumulator& operator=(
+            const ReproducibleFloatingAccumulator&) = default;
+
+        ///Reset the binned fp to zero by clearing every element of `data`
+        void zero()
+        {
+            data.fill(0);
+        }
+
+        ///Return FOLD, the fold of the binned fp
+        constexpr int fold() const
+        {
+            return FOLD;
+        }
+
+        ///Return ENDURANCE, the deposit endurance of the binned fp
+        constexpr int endurance() const
+        {
+            return ENDURANCE;
+        }
+
+        ///Returns the number of reference bins. Used for judging memory usage.
+        constexpr size_t number_of_reference_bins() const
+        {
+            return MAXINDEX + MAXFOLD;    // array<>::size() is non-static
+        }
+
+        ///Accumulate the arithmetic @p x (cast to ftype) into the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator+=(const U x)
+        {
+            const ftype summand = static_cast<ftype>(x);
+            binned_dmdadd(summand, 1, 1);
+            return *this;
+        }
+
+        ///Subtract the arithmetic @p x (cast to ftype) from the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator-=(const U x)
+        {
+            const ftype negated = -static_cast<ftype>(x);
+            binned_dmdadd(negated, 1, 1);
+            return *this;
+        }
+
+        ///Accumulate the binned fp @p other into this binned fp.
+        ReproducibleFloatingAccumulator& operator+=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            binned_dmdmadd(other, 1, 1, 1, 1);
+            return *this;
+        }
+
+        ///Subtract the binned fp @p other (negates a copy of it first; slow)
+        ReproducibleFloatingAccumulator& operator-=(
+            const ReproducibleFloatingAccumulator& other)
+        {
+            auto negated = other;    // work on a copy; `other` stays untouched
+            binned_dbdbadd(-negated);
+            return *this;
+        }
+
+        ///True when the `data` members of both binned fp compare equal
+        bool operator==(const ReproducibleFloatingAccumulator& other) const
+        {
+            return other.data == data;
+        }
+
+        ///True when the two binned fp differ, i.e. the negation of operator==
+        bool operator!=(const ReproducibleFloatingAccumulator& other) const
+        {
+            return !(*this == other);
+        }
+
+        ///Reset this binned fp to zero, then accumulate the arithmetic @p x
+        ///NOTE: @p x is cast to the type of the binned fp
+        template <typename U,
+            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
+        ReproducibleFloatingAccumulator& operator=(const U x)
+        {
+            data = {0};
+            binned_dmdadd(static_cast<ftype>(x), 1, 1);
+            return *this;
+        }
+
+        ///Returns the negative of this binned fp
+        ///NOTE: Makes a copy and performs arithmetic; slow.
+        ReproducibleFloatingAccumulator operator-() const
+        {
+            // All work happens on the copy, so *this is never modified.
+            ReproducibleFloatingAccumulator negated = *this;
+            if (negated.primary(0) != 0.0)
+            {
+                const auto* const bins =
+                    negated.binned_bins(negated.binned_index());
+                for (int i = 0; i < FOLD; i++)
+                {
+                    ftype& pri = negated.primary(i);
+                    pri = bins[i] - (pri - bins[i]);    // reflect around bins[i]
+                    negated.carry(i) = -negated.carry(i);
+                }
+            }
+            return negated;
+        }
+
+        ///Convert this binned fp into its native floating-point representation
+        ///
+        ///Uses binned_conv_single() when ftype is float and binned_conv_double()
+        ///for every other ftype; both are called with the arguments (1, 1).
+        ftype conv() const
+        {
+            if (!std::is_same_v<ftype, float>)
+            {
+                return binned_conv_double(1, 1);
+            }
+            return binned_conv_single(1, 1);
+        }
+
+        ///@brief Get binned fp summation error bound
+        ///
+        ///Bounds the absolute error of a summation done with binned types. The
+        ///bound is evaluated in double precision and then cast to ftype.
+        ///@param N           The number of floating point summands
+        ///@param max_abs_val The summand of maximum absolute value
+        ///@param binned_sum  The value of the sum computed using binned types
+        ///@return            The absolute error bound
+        static constexpr ftype error_bound(
+            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
+        {
+            const double X = std::abs(max_abs_val);
+            const double S = std::abs(binned_sum);
+            return static_cast<ftype>(max(X, ldexp(0.5, MIN_EXP - 1)) *
+                    ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
+                ((7.0 * EPSILON) /
+                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
+                        7.0 * EPSILON)) *
+                    S);
+        }
+
+        ///Add the single value @p x to the binned fp
+        void add(const ftype x)
+        {
+            binned_dmdadd(x, 1, 1);
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///
+        ///@param first       Start of range
+        ///@param last        End of range; `last - first` must be well-formed
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last, const ftype max_abs_val)
+        {
+            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
+            size_t count = 0;    // reset to 0 whenever a renorm runs
+            size_t N = last - first;    // iterators must support operator-
+            for (; first != last; first++, count++)
+            {
+                binned_dmddeposit(static_cast<ftype>(*first), 1);
+                // the N check lets the compiler drop the renorm for short ranges
+                if (N > ENDURANCE && count == ENDURANCE)
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add arithmetics in the range [first, last) to the binned fp
+        ///NOTE: A maximum absolute value is calculated, so two passes are made over
+        ///      the data. An empty range is a no-op.
+        ///@param first       Start of range
+        ///@param last        End of range
+        template <typename InputIt>
+        void add(InputIt first, InputIt last)
+        {
+            if (first == last)
+                return;    // max_element would return `last`; never deref it
+            const auto largest = *std::max_element(
+                first, last, [](const auto& a, const auto& b) {
+                    return std::abs(a) < std::abs(b);
+                });
+            add(first, last, static_cast<ftype>(largest));
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///Does nothing when @p N is zero.
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        ///@param max_abs_val Maximum absolute value of any member of the range
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void add(const T* input, const size_t N, const ftype max_abs_val)
+        {
+            if (N != 0)
+                add(input, input + N, max_abs_val);
+        }
+
+        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
+        ///
+        ///NOTE: The largest magnitude is computed first, so the data is traversed
+        ///      twice
+        ///
+        ///@param input       Start of the range
+        ///@param N           Number of elements to add
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void add(const T* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+
+            T largest = input[0];
+            for (const T* it = input; it != input + N; ++it)
+            {
+                largest = max(largest, std::abs(*it));
+            }
+            add(input, N, largest);
+        }
+
+        ///Accumulate all four components of the float4 @p x (each cast to ftype).
+        ///NOTE(review): no renorm follows the deposits here - confirm intended
+        ReproducibleFloatingAccumulator& operator+=(const float4& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            binned_dmddeposit(static_cast<ftype>(x.z), 1);
+            binned_dmddeposit(static_cast<ftype>(x.w), 1);
+            return *this;
+        }
+
+        ///Accumulate both components of the float2 @p x (each cast to ftype).
+        ///NOTE(review): no renorm follows the deposits here - confirm intended
+        ReproducibleFloatingAccumulator& operator+=(const float2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        ///Accumulate both components of the double2 @p x (each cast to ftype).
+        ///NOTE(review): no renorm follows the deposits here - confirm intended
+        ReproducibleFloatingAccumulator& operator+=(const double2& x)
+        {
+            binned_dmdupdate(abs_max(x), 1, 1);
+            binned_dmddeposit(static_cast<ftype>(x.x), 1);
+            binned_dmddeposit(static_cast<ftype>(x.y), 1);
+            return *this;
+        }
+
+        ///Add @p N float4 elements, renormalizing once ENDURANCE deposits accrue
+        void add(const float4* input, const size_t N, float max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+            size_t count = 0;    // deposits made since the last renorm
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
+                count += 4;    // one deposit per component
+                if (count >= static_cast<size_t>(ENDURANCE))
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add @p N double2 elements, renormalizing once ENDURANCE deposits accrue
+        void add(const double2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+            size_t count = 0;    // deposits made since the last renorm
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                count += 2;    // one deposit per component
+                if (count >= static_cast<size_t>(ENDURANCE))
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add @p N float2 elements, renormalizing once ENDURANCE deposits accrue
+        void add(const float2* input, const size_t N, double max_abs_val)
+        {
+            if (N == 0)
+                return;
+            binned_dmdupdate(max_abs_val, 1, 1);
+            size_t count = 0;    // deposits made since the last renorm
+            for (size_t i = 0; i < N; i++)
+            {
+                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
+                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
+                count += 2;    // one deposit per component
+                if (count >= static_cast<size_t>(ENDURANCE))
+                {
+                    binned_dmrenorm(1, 1);
+                    count = 0;
+                }
+            }
+        }
+
+        ///Add @p N float4 elements after scanning for their largest magnitude
+        void add(const float4* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+            auto largest = abs_max(input[0]);
+            for (const float4* it = input + 1; it != input + N; ++it)
+                largest = fmax(largest, abs_max(*it));
+
+            add(input, N, largest);
+        }
+
+        ///Add @p N double2 elements after scanning for their largest magnitude
+        void add(const double2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+            auto largest = abs_max(input[0]);
+            for (const double2* it = input + 1; it != input + N; ++it)
+                largest = fmax(largest, abs_max(*it));
+
+            add(input, N, largest);
+        }
+
+        ///Add @p N float2 elements after scanning for their largest magnitude
+        void add(const float2* input, const size_t N)
+        {
+            if (N == 0)
+                return;
+            auto largest = abs_max(input[0]);
+            for (const float2* it = input + 1; it != input + N; ++it)
+                largest = fmax(largest, abs_max(*it));
+
+            add(input, N, largest);
+        }
+
+        //////////////////////////////////////
+        //MANUAL OPERATIONS; USE WISELY
+        //////////////////////////////////////
+
+        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
+        ///
+        ///Once rebinned, up to `ENDURANCE` values of magnitude <= @p mav can be added
+        ///with `unsafe_add()`, after which `renorm()` must be called. See the source
+        ///of the range overloads of `add()` for an example.
+        template <typename T,
+            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
+        void set_max_abs_val(const T mav)
+        {
+            binned_dmdupdate(std::abs(mav), 1, 1);    // sign of mav is ignored
+        }
+
+        ///Deposit @p x into the binned fp without rebinning or renormalizing
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()`
+        void unsafe_add(const ftype x)
+        {
+            binned_dmddeposit(x, 1);
+        }
+
+        ///Renormalizes the binned fp by calling binned_dmrenorm(1, 1)
+        ///
+        ///This is intended to be used after a call to `set_max_abs_val()` and one or
+        ///more calls to `unsafe_add()`
+        void renorm()
+        {
+            binned_dmrenorm(1, 1);
+        }
+    };
+
+    
+}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
index 559ee830030e..76dc5fcd9806 100644
--- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
@@ -246,3 +246,7 @@ foreach(test ${tests})
     "modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS}
   )
 endforeach()
+
+target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address)
+
+target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address)
\ No newline at end of file
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index c8271da6ac6b..1119c8113c6b 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -17,7 +17,6 @@
 #include <limits>
 #include <numeric>
 #include <random>
-#include <string>
 #include <vector>
 
 #include "test_utils.hpp"

From 5c0129f6fa761da67da30af1283c2304736885b1 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sat, 7 Dec 2024 00:44:59 -0600
Subject: [PATCH 04/19] Remove unnecessary things from rfa

- Also perform renorm and update only when necessary

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           |    2 +-
 .../parallel/algorithms/detail/rfa_cuda.hpp   | 1168 -----------------
 2 files changed, 1 insertion(+), 1169 deletions(-)
 delete mode 100644 libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 7e29adce97cf..b37730889172 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -9,7 +9,7 @@
 #include <hpx/config.hpp>
 #include <hpx/functional/detail/tag_fallback_invoke.hpp>
 #include <hpx/functional/invoke.hpp>
-#include <hpx/parallel/algorithms/detail/rfa_cuda.hpp>
+#include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/util/loop.hpp>
 
 #include <cstddef>
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
deleted file mode 100644
index 05f71d9ae746..000000000000
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa_cuda.hpp
+++ /dev/null
@@ -1,1168 +0,0 @@
-//Reproducible Floating Point Accumulations via Binned Floating Point
-//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0.
-//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel.
-//
-//The code accomplishes several objectives:
-//
-//1. Reproducible summation, independent of summation order, assuming only a
-//   subset of the IEEE 754 Floating Point Standard
-//
-//2. Has accuracy at least as good as conventional summation, and tunable
-//
-//3. Handles overflow, underflow, and other exceptions reproducibly.
-//
-//4. Makes only one read-only pass over the summands.
-//
-//5. Requires only one parallel reduction.
-//
-//6. Uses minimal memory (6 doubles per accumulator with fold=3).
-//
-//7. Relatively easy to use
-
-#pragma once
-
-#include <algorithm>
-#include <climits>
-#include <cmath>
-#include <cstdint>
-#include <limits>
-
-#ifndef __CUDACC__
-#define __host__
-#define __device__
-#define __forceinline__
-#include <array>
-using std::array;
-using std::max;
-using std::min;
-#else
-#include <cuda/std/array>
-using cuda::std::array;
-using cuda::std::max;
-using cuda::std::min;
-#include "vector.hpp"
-#endif
-
-namespace hpx::parallel::detail::rfa {
-    template <typename F>
-    struct type4
-    {
-        F x;
-        F y;
-        F z;
-        F w;
-    };
-
-    template <typename F>
-    struct type2
-    {
-        F x;
-        F y;
-    };
-    using float4 = type4<float>;
-    using double4 = type4<double>;
-    using float2 = type2<float>;
-    using double2 = type2<double>;
-
-    auto abs_max(float4 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        auto z = std::abs(a.z);
-        auto w = std::abs(a.w);
-        const std::vector<float> v = {x, y, z, w};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(double4 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        auto z = std::abs(a.z);
-        auto w = std::abs(a.w);
-        const std::vector<double> v = {x, y, z, w};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(float2 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        const std::vector<float> v = {x, y};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-    auto abs_max(double2 a)
-    {
-        auto x = std::abs(a.x);
-        auto y = std::abs(a.y);
-        const std::vector<double> v = {x, y};
-        return *std::max_element(v.begin(), v.end());
-    }
-
-// disable zero checks
-#define DISABLE_ZERO
-
-// disable nan / infinity checks
-#define DISABLE_NANINF
-
-// jump table for indexing into data
-#define MAX_JUMP 5
-    static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max");
-
-    template <typename Real>
-    inline constexpr Real ldexp_impl(Real arg, int exp) noexcept
-    {
-        return std::ldexp(arg, exp);
-        // while (arg == 0)
-        // {
-        //     return arg;
-        // }
-        // while (exp > 0)
-        // {
-        //     arg *= static_cast<Real>(2);
-        //     --exp;
-        // }
-        // while (exp < 0)
-        // {
-        //     arg /= static_cast<Real>(2);
-        //     ++exp;
-        // }
-
-        // return arg;
-    }
-
-    template <class ftype>
-    struct RFA_bins
-    {
-        static constexpr auto BIN_WIDTH =
-            std::is_same_v<ftype, double> ? 40 : 13;
-        static constexpr auto MIN_EXP =
-            std::numeric_limits<ftype>::min_exponent;
-        static constexpr auto MAX_EXP =
-            std::numeric_limits<ftype>::max_exponent;
-        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
-        ///Binned floating-point maximum index
-        static constexpr auto MAXINDEX =
-            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
-        //The maximum floating-point fold supported by the library
-        static constexpr auto MAXFOLD = MAXINDEX + 1;
-
-        ///The binned floating-point reference bins
-        array<ftype, MAXINDEX + MAXFOLD> bins = {};
-
-        constexpr ftype& operator[](int d)
-        {
-            return bins[d];
-        }
-
-        void initialize_bins()
-        {
-            if constexpr (std::is_same_v<ftype, float>)
-            {
-                bins[0] = std::ldexp(0.75, MAX_EXP);
-            }
-            else
-            {
-                bins[0] = 2.0 * ldexp(0.75, MAX_EXP - 1);
-            }
-
-            for (int index = 1; index <= MAXINDEX; index++)
-            {
-                bins[index] = ldexp(0.75,
-                    MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH);
-            }
-            for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++)
-            {
-                bins[index] = bins[index - 1];
-            }
-        }
-    };
-
-    static char bin_host_buffer[sizeof(RFA_bins<double>)];
-#ifdef __CUDACC__
-    __constant__ static char bin_device_buffer[sizeof(RFA_bins<double>)];
-#endif
-
-    ///Class to hold a reproducible summation of the numbers passed to it
-    ///
-    ///@param ftype Floating-point data type; either `float` or `double
-    ///@param FOLD  The fold; use 3 as a default unless you understand it.
-    template <class ftype_, int FOLD_ = 3,
-        typename std::enable_if_t<std::is_floating_point<ftype_>::value>* =
-            nullptr>
-    class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator
-    {
-    public:
-        using ftype = ftype_;
-        static constexpr int FOLD = FOLD_;
-
-    private:
-        array<ftype, 2 * FOLD> data = {0};
-
-        ///Floating-point precision bin width
-        static constexpr auto BIN_WIDTH =
-            std::is_same_v<ftype, double> ? 40 : 13;
-        static constexpr auto MIN_EXP =
-            std::numeric_limits<ftype>::min_exponent;
-        static constexpr auto MAX_EXP =
-            std::numeric_limits<ftype>::max_exponent;
-        static constexpr auto MANT_DIG = std::numeric_limits<ftype>::digits;
-        ///Binned floating-point maximum index
-        static constexpr auto MAXINDEX =
-            ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1;
-        //The maximum floating-point fold supported by the library
-        static constexpr auto MAXFOLD = MAXINDEX + 1;
-        ///Binned floating-point compression factor
-        ///This factor is used to scale down inputs before deposition into the bin of
-        ///highest index
-        static constexpr auto COMPRESSION =
-            1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1));
-        ///Binned double precision expansion factor
-        ///This factor is used to scale up inputs after deposition into the bin of
-        ///highest index
-        static constexpr auto EXPANSION =
-            1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1));
-        static constexpr auto EXP_BIAS = MAX_EXP - 2;
-        static constexpr auto EPSILON = std::numeric_limits<ftype>::epsilon();
-        ///Binned floating-point deposit endurance
-        ///The number of deposits that can be performed before a renorm is necessary.
-        ///Applies also to binned complex double precision.
-        static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2);
-
-        ///Return a binned floating-point reference bin
-        inline const ftype* binned_bins(const int x) const
-        {
-#ifdef __CUDA_ARCH__    // must be arch not CC here
-            return &reinterpret_cast<RFA_bins<ftype>&>(bin_device_buffer)[x];
-#else
-            return &reinterpret_cast<RFA_bins<ftype>&>(bin_host_buffer)[x];
-#endif
-        }
-
-        ///Get the bit representation of a float
-        static inline uint32_t& get_bits(float& x)
-        {
-            return *reinterpret_cast<uint32_t*>(&x);
-        }
-        ///Get the bit representation of a double
-        static inline uint64_t& get_bits(double& x)
-        {
-            return *reinterpret_cast<uint64_t*>(&x);
-        }
-        ///Get the bit representation of a const float
-        static inline uint32_t get_bits(const float& x)
-        {
-            return *reinterpret_cast<const uint32_t*>(&x);
-        }
-        ///Get the bit representation of a const double
-        static inline uint64_t get_bits(const double& x)
-        {
-            return *reinterpret_cast<const uint64_t*>(&x);
-        }
-
-        ///Return primary vector value const ref
-        inline const ftype& primary(int i) const
-        {
-            if constexpr (FOLD <= MAX_JUMP)
-            {
-                switch (i)
-                {
-                case 0:
-                    if constexpr (FOLD >= 1)
-                        return data[0];
-                case 1:
-                    if constexpr (FOLD >= 2)
-                        return data[1];
-                case 2:
-                    if constexpr (FOLD >= 3)
-                        return data[2];
-                case 3:
-                    if constexpr (FOLD >= 4)
-                        return data[3];
-                case 4:
-                    if constexpr (FOLD >= 5)
-                        return data[4];
-                default:
-                    return data[FOLD - 1];
-                }
-            }
-            else
-            {
-                return data[i];
-            }
-        }
-
-        ///Return carry vector value const ref
-        inline const ftype& carry(int i) const
-        {
-            if constexpr (FOLD <= MAX_JUMP)
-            {
-                switch (i)
-                {
-                case 0:
-                    if constexpr (FOLD >= 1)
-                        return data[FOLD + 0];
-                case 1:
-                    if constexpr (FOLD >= 2)
-                        return data[FOLD + 1];
-                case 2:
-                    if constexpr (FOLD >= 3)
-                        return data[FOLD + 2];
-                case 3:
-                    if constexpr (FOLD >= 4)
-                        return data[FOLD + 3];
-                case 4:
-                    if constexpr (FOLD >= 5)
-                        return data[FOLD + 4];
-                default:
-                    return data[2 * FOLD - 1];
-                }
-            }
-            else
-            {
-                return data[FOLD + i];
-            }
-        }
-
-        ///Return primary vector value ref
-        inline ftype& primary(int i)
-        {
-            const auto& c = *this;
-            return const_cast<ftype&>(c.primary(i));
-        }
-
-        ///Return carry vector value ref
-        inline ftype& carry(int i)
-        {
-            const auto& c = *this;
-            return const_cast<ftype&>(c.carry(i));
-        }
-
-#ifdef DISABLE_ZERO
-        static inline constexpr bool ISZERO(const ftype)
-        {
-            return false;
-        }
-#else
-        static inline constexpr bool ISZERO(const ftype x)
-        {
-            return x == 0.0;
-        }
-#endif
-
-#ifdef DISABLE_NANINF
-        static inline constexpr int ISNANINF(const ftype)
-        {
-            return false;
-        }
-#else
-        static inline constexpr int ISNANINF(const ftype x)
-        {
-            const auto bits = get_bits(x);
-            return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) ==
-                ((2ull * MAX_EXP - 1) << (MANT_DIG - 1));
-        }
-#endif
-
-        static inline constexpr int EXP(const ftype x)
-        {
-            const auto bits = get_bits(x);
-            return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1);
-        }
-
-        ///Get index of float-point precision
-        ///The index of a non-binned type is the smallest index a binned type would
-        ///need to have to sum it reproducibly. Higher indicies correspond to smaller
-        ///bins.
-        static inline constexpr int binned_dindex(const ftype x)
-        {
-            int exp = EXP(x);
-            if (exp == 0)
-            {
-                if (x == 0.0)
-                {
-                    return MAXINDEX;
-                }
-                else
-                {
-                    frexp(x, &exp);
-                    return min((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX);
-                }
-            }
-            return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH;
-        }
-
-        ///Get index of manually specified binned double precision
-        ///The index of a binned type is the bin that it corresponds to. Higher
-        ///indicies correspond to smaller bins.
-        inline int binned_index() const
-        {
-            return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) -
-                       EXP(primary(0))) /
-                BIN_WIDTH;
-        }
-
-        ///Check if index of manually specified binned floating-point is 0
-        ///A quick check to determine if the index is 0
-        inline bool binned_index0() const
-        {
-            return EXP(primary(0)) == MAX_EXP + EXP_BIAS;
-        }
-
-        ///Update manually specified binned fp with a scalar (X -> Y)
-        ///
-        ///This method updates the binned fp to an index suitable for adding numbers
-        ///with absolute value less than @p max_abs_val
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdupdate(
-            const ftype max_abs_val, const int incpriY, const int inccarY)
-        {
-            if (ISNANINF(primary(0)))
-                return;
-
-            int X_index = binned_dindex(max_abs_val);
-            if (ISZERO(primary(0)))
-            {
-                const ftype* const bins = binned_bins(X_index);
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) = bins[i];
-                    carry(i * inccarY) = 0.0;
-                }
-            }
-            else
-            {
-                int shift = binned_index() - X_index;
-                if (shift > 0)
-                {
-#pragma unroll
-                    for (int i = FOLD - 1; i >= 1; i--)
-                    {
-                        if (i < shift)
-                            break;
-                        primary(i * incpriY) = primary((i - shift) * incpriY);
-                        carry(i * inccarY) = carry((i - shift) * inccarY);
-                    }
-                    const ftype* const bins = binned_bins(X_index);
-#pragma unroll
-                    for (int j = 0; j < FOLD; j++)
-                    {
-                        if (j >= shift)
-                            break;
-                        primary(j * incpriY) = bins[j];
-                        carry(j * inccarY) = 0.0;
-                    }
-                }
-            }
-        }
-
-        ///Add scalar @p X to suitably binned manually specified binned fp (Y += X)
-        ///
-        ///Performs the operation Y += X on an binned type Y where the index of Y is
-        ///larger than the index of @p X
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        void binned_dmddeposit(const ftype X, const int incpriY)
-        {
-            ftype M;
-            ftype x = X;
-
-            if (ISNANINF(x) || ISNANINF(primary(0)))
-            {
-                primary(0) += x;
-                return;
-            }
-
-            if (binned_index0())
-            {
-                M = primary(0);
-                ftype qd = x * COMPRESSION;
-                auto& ql = get_bits(qd);
-                ql |= 1;
-                qd += M;
-                primary(0) = qd;
-                M -= qd;
-                M *= EXPANSION * 0.5;
-                x += M;
-                x += M;
-#pragma unroll
-                for (int i = 1; i < FOLD - 1; i++)
-                {
-                    M = primary(i * incpriY);
-                    qd = x;
-                    ql |= 1;
-                    qd += M;
-                    primary(i * incpriY) = qd;
-                    M -= qd;
-                    x += M;
-                }
-                qd = x;
-                ql |= 1;
-                primary((FOLD - 1) * incpriY) += qd;
-            }
-            else
-            {
-                ftype qd = x;
-                auto& ql = get_bits(qd);
-#pragma unroll
-                for (int i = 0; i < FOLD - 1; i++)
-                {
-                    M = primary(i * incpriY);
-                    qd = x;
-                    ql |= 1;
-                    qd += M;
-                    primary(i * incpriY) = qd;
-                    M -= qd;
-                    x += M;
-                }
-                qd = x;
-                ql |= 1;
-                primary((FOLD - 1) * incpriY) += qd;
-            }
-        }
-
-        ///Renormalize manually specified binned double precision
-        ///
-        ///Renormalization keeps the primary vector within the necessary bins by
-        ///shifting over to the carry vector
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        inline void binned_dmrenorm(const int incpriX, const int inccarX)
-        {
-            if (ISZERO(primary(0)) || ISNANINF(primary(0)))
-                return;
-
-            for (int i = 0; i < FOLD; i++)
-            {
-                auto tmp_renormd = primary(i * incpriX);
-                auto& tmp_renorml = get_bits(tmp_renormd);
-
-                carry(i * inccarX) +=
-                    (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2;
-
-                tmp_renorml &= ~(1ull << (MANT_DIG - 3));
-                tmp_renorml |= 1ull << (MANT_DIG - 2);
-                primary(i * incpriX) = tmp_renormd;
-            }
-        }
-
-        ///Add scalar to manually specified binned fp (Y += X)
-        ///
-        ///Performs the operation Y += X on an binned type Y
-        ///
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdadd(const ftype X, const int incpriY, const int inccarY)
-        {
-            binned_dmdupdate(X, incpriY, inccarY);
-            binned_dmddeposit(X, incpriY);
-            binned_dmrenorm(incpriY, inccarY);
-        }
-
-        ///Convert manually specified binned fp to native double-precision (X -> Y)
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        double binned_conv_double(const int incpriX, const int inccarX) const
-        {
-            int i = 0;
-
-            if (ISNANINF(primary(0)))
-                return primary(0);
-            if (ISZERO(primary(0)))
-                return 0.0;
-
-            double Y = 0.0;
-            double scale_down;
-            double scale_up;
-            int scaled;
-            const auto X_index = binned_index();
-            const auto* const bins = binned_bins(X_index);
-            if (X_index <= (3 * MANT_DIG) / BIN_WIDTH)
-            {
-                scale_down = ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH));
-                scale_up = ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH));
-                scaled =
-                    max(min(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0);
-                if (X_index == 0)
-                {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down * EXPANSION);
-                    Y += carry(inccarX) * ((bins[1] / 6.0) * scale_down);
-                    Y += (primary(0) - bins[0]) * scale_down * EXPANSION;
-                    i = 2;
-                }
-                else
-                {
-                    Y += carry(0) * ((bins[0] / 6.0) * scale_down);
-                    i = 1;
-                }
-                for (; i < scaled; i++)
-                {
-                    Y += carry(i * inccarX) * ((bins[i] / 6.0) * scale_down);
-                    Y +=
-                        (primary((i - 1) * incpriX) - bins[i - 1]) * scale_down;
-                }
-                if (i == FOLD)
-                {
-                    Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]) *
-                        scale_down;
-                    return Y * scale_up;
-                }
-                if (std::isinf(Y * scale_up))
-                {
-                    return Y * scale_up;
-                }
-                Y *= scale_up;
-                for (; i < FOLD; i++)
-                {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += primary((i - 1) * incpriX) - bins[i - 1];
-                }
-                Y += primary((FOLD - 1) * incpriX) - bins[FOLD - 1];
-            }
-            else
-            {
-                Y += carry(0) * (bins[0] / 6.0);
-                for (i = 1; i < FOLD; i++)
-                {
-                    Y += carry(i * inccarX) * (bins[i] / 6.0);
-                    Y += (primary((i - 1) * incpriX) - bins[i - 1]);
-                }
-                Y += (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
-            }
-            return Y;
-        }
-
-        ///Convert manually specified binned fp to native single-precision (X -> Y)
-        ///
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        float binned_conv_single(const int incpriX, const int inccarX) const
-        {
-            int i = 0;
-            double Y = 0.0;
-
-            if (ISNANINF(primary(0)))
-                return primary(0);
-            if (ISZERO(primary(0)))
-                return 0.0;
-
-            //Note that the following order of summation is in order of decreasing
-            //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and
-            //the number of carries equal to 1.
-            const auto X_index = binned_index();
-            const auto* const bins = binned_bins(X_index);
-            if (X_index == 0)
-            {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0) *
-                    (double) EXPANSION;
-                Y += (double) carry(inccarX) * (double) (bins[1] / 6.0);
-                Y += (double) (primary(0) - bins[0]) * (double) EXPANSION;
-                i = 2;
-            }
-            else
-            {
-                Y += (double) carry(0) * (double) (bins[0] / 6.0);
-                i = 1;
-            }
-            for (; i < FOLD; i++)
-            {
-                Y += (double) carry(i * inccarX) * (double) (bins[i] / 6.0);
-                Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]);
-            }
-            Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]);
-
-            return (float) Y;
-        }
-
-        ///Add two manually specified binned fp (Y += X)
-        ///Performs the operation Y += X
-        ///
-        ///@param other   Another binned fp of the same type
-        ///@param incpriX stride within X's primary vector (use every incpriX'th element)
-        ///@param inccarX stride within X's carry vector (use every inccarX'th element)
-        ///@param incpriY stride within Y's primary vector (use every incpriY'th element)
-        ///@param inccarY stride within Y's carry vector (use every inccarY'th element)
-        void binned_dmdmadd(const ReproducibleFloatingAccumulator& x,
-            const int incpriX, const int inccarX, const int incpriY,
-            const int inccarY)
-        {
-            if (ISZERO(x.primary(0)))
-                return;
-
-            if (ISZERO(primary(0)))
-            {
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) = x.primary(i * incpriX);
-                    carry(i * inccarY) = x.carry(i * inccarX);
-                }
-                return;
-            }
-
-            if (ISNANINF(x.primary(0)) || ISNANINF(primary(0)))
-            {
-                primary(0) += x.primary(0);
-                return;
-            }
-
-            const auto X_index = x.binned_index();
-            const auto Y_index = this->binned_index();
-            const auto shift = Y_index - X_index;
-            if (shift > 0)
-            {
-                const auto* const bins = binned_bins(Y_index);
-                //shift Y upwards and add X to Y
-#pragma unroll
-                for (int i = FOLD - 1; i >= 1; i--)
-                {
-                    if (i < shift)
-                        break;
-                    primary(i * incpriY) = x.primary(i * incpriX) +
-                        (primary((i - shift) * incpriY) - bins[i - shift]);
-                    carry(i * inccarY) =
-                        x.carry(i * inccarX) + carry((i - shift) * inccarY);
-                }
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    if (i == shift)
-                        break;
-                    primary(i * incpriY) = x.primary(i * incpriX);
-                    carry(i * inccarY) = x.carry(i * inccarX);
-                }
-            }
-            else if (shift < 0)
-            {
-                const auto* const bins = binned_bins(X_index);
-                //shift X upwards and add X to Y
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    if (i < -shift)
-                        continue;
-                    primary(i * incpriY) +=
-                        x.primary((i + shift) * incpriX) - bins[i + shift];
-                    carry(i * inccarY) += x.carry((i + shift) * inccarX);
-                }
-            }
-            else if (shift == 0)
-            {
-                const auto* const bins = binned_bins(X_index);
-                // add X to Y
-#pragma unroll
-                for (int i = 0; i < FOLD; i++)
-                {
-                    primary(i * incpriY) += x.primary(i * incpriX) - bins[i];
-                    carry(i * inccarY) += x.carry(i * inccarX);
-                }
-            }
-
-            binned_dmrenorm(incpriY, inccarY);
-        }
-
-        ///Add two manually specified binned fp (Y += X)
-        ///Performs the operation Y += X
-        void binned_dbdbadd(const ReproducibleFloatingAccumulator& other)
-        {
-            binned_dmdmadd(other, 1, 1, 1, 1);
-        }
-
-    public:
-        ReproducibleFloatingAccumulator() = default;
-        ReproducibleFloatingAccumulator(
-            const ReproducibleFloatingAccumulator&) = default;
-        ///Sets this binned fp equal to another binned fp
-        ReproducibleFloatingAccumulator& operator=(
-            const ReproducibleFloatingAccumulator&) = default;
-
-        ///Set the binned fp to zero
-        void zero()
-        {
-            data = {0};
-        }
-
-        ///Return the fold of the binned fp
-        constexpr int fold() const
-        {
-            return FOLD;
-        }
-
-        ///Return the endurance of the binned fp
-        constexpr int endurance() const
-        {
-            return ENDURANCE;
-        }
-
-        ///Returns the number of reference bins. Used for judging memory usage.
-        constexpr size_t number_of_reference_bins()
-        {
-            return array<ftype, MAXINDEX + MAXFOLD>::size();
-        }
-
-        ///Accumulate an arithmetic @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator+=(const U x)
-        {
-            binned_dmdadd(static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Accumulate-subtract an arithmetic @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator-=(const U x)
-        {
-            binned_dmdadd(-static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Accumulate a binned fp @p x into the binned fp.
-        ReproducibleFloatingAccumulator& operator+=(
-            const ReproducibleFloatingAccumulator& other)
-        {
-            binned_dbdbadd(other);
-            return *this;
-        }
-
-        ///Accumulate-subtract a binned fp @p x into the binned fp.
-        ///NOTE: Makes a copy and performs arithmetic; slow.
-        ReproducibleFloatingAccumulator& operator-=(
-            const ReproducibleFloatingAccumulator& other)
-        {
-            const auto temp = -other;
-            binned_dbdbadd(temp);
-        }
-
-        ///Determines if two binned fp are equal
-        bool operator==(const ReproducibleFloatingAccumulator& other) const
-        {
-            return data == other.data;
-        }
-
-        ///Determines if two binned fp are not equal
-        bool operator!=(const ReproducibleFloatingAccumulator& other) const
-        {
-            return !operator==(other);
-        }
-
-        ///Sets this binned fp equal to the arithmetic value @p x
-        ///NOTE: Casts @p x to the type of the binned fp
-        template <typename U,
-            typename std::enable_if_t<std::is_arithmetic_v<U>>* = nullptr>
-        ReproducibleFloatingAccumulator& operator=(const U x)
-        {
-            zero();
-            binned_dmdadd(static_cast<ftype>(x), 1, 1);
-            return *this;
-        }
-
-        ///Returns the negative of this binned fp
-        ///NOTE: Makes a copy and performs arithmetic; slow.
-        ReproducibleFloatingAccumulator operator-()
-        {
-            constexpr int incpriX = 1;
-            constexpr int inccarX = 1;
-            ReproducibleFloatingAccumulator temp = *this;
-            if (primary(0) != 0.0)
-            {
-                const auto* const bins = binned_bins(binned_index());
-                for (int i = 0; i < FOLD; i++)
-                {
-                    temp.primary(i * incpriX) =
-                        bins[i] - (primary(i * incpriX) - bins[i]);
-                    temp.carry(i * inccarX) = -carry(i * inccarX);
-                }
-            }
-            return temp;
-        }
-
-        ///Convert this binned fp into its native floating-point representation
-        ftype conv() const
-        {
-            if (std::is_same_v<ftype, float>)
-            {
-                return binned_conv_single(1, 1);
-            }
-            else
-            {
-                return binned_conv_double(1, 1);
-            }
-        }
-
-        ///@brief Get binned fp summation error bound
-        ///
-        ///This is a bound on the absolute error of a summation using binned types
-        ///
-        ///@param N           The number of single precision floating point summands
-        ///@param max_abs_val The summand of maximum absolute value
-        ///@param binned_sum  The value of the sum computed using binned types
-        ///@return            The absolute error bound
-        static constexpr ftype error_bound(
-            const uint64_t N, const ftype max_abs_val, const ftype binned_sum)
-        {
-            const double X = std::abs(max_abs_val);
-            const double S = std::abs(binned_sum);
-            return static_cast<ftype>(max(X, ldexp(0.5, MIN_EXP - 1)) *
-                    ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N +
-                ((7.0 * EPSILON) /
-                    (1.0 - 6.0 * std::sqrt(static_cast<double>(EPSILON)) -
-                        7.0 * EPSILON)) *
-                    S);
-        }
-
-        ///Add @p x to the binned fp
-        void add(const ftype x)
-        {
-            binned_dmdadd(x, 1, 1);
-        }
-
-        ///Add arithmetics in the range [first, last) to the binned fp
-        ///
-        ///@param first       Start of range
-        ///@param last        End of range
-        ///@param max_abs_val Maximum absolute value of any member of the range
-        template <typename InputIt>
-        void add(InputIt first, InputIt last, const ftype max_abs_val)
-        {
-            binned_dmdupdate(std::abs(max_abs_val), 1, 1);
-            size_t count = 0;
-            size_t N = last - first;
-            for (; first != last; first++, count++)
-            {
-                binned_dmddeposit(static_cast<ftype>(*first), 1);
-                // first conditional allows compiler to remove the call here when possible
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        ///Add arithmetics in the range [first, last) to the binned fp
-        ///
-        ///NOTE: A maximum absolute value is calculated, so two passes are made over
-        ///      the data
-        ///
-        ///@param first       Start of range
-        ///@param last        End of range
-        template <typename InputIt>
-        void add(InputIt first, InputIt last)
-        {
-            const auto max_abs_val = *std::max_element(
-                first, last, [](const auto& a, const auto& b) {
-                    return std::abs(a) < std::abs(b);
-                });
-            add(first, last, static_cast<ftype>(max_abs_val));
-        }
-
-        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
-        ///
-        ///@param input       Start of the range
-        ///@param N           Number of elements to add
-        ///@param max_abs_val Maximum absolute value of any member of the range
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void add(const T* input, const size_t N, const ftype max_abs_val)
-        {
-            if (N == 0)
-                return;
-            add(input, input + N, max_abs_val);
-        }
-
-        ///Add @p N elements starting at @p input to the binned fp: [input, input+N)
-        ///
-        ///NOTE: A maximum absolute value is calculated, so two passes are made over
-        ///      the data
-        ///
-        ///@param input       Start of the range
-        ///@param N           Number of elements to add
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void add(const T* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            T max_abs_val = input[0];
-            for (size_t i = 0; i < N; i++)
-            {
-                max_abs_val = max(max_abs_val, std::abs(input[i]));
-            }
-            add(input, N, max_abs_val);
-        }
-
-        ///Accumulate a float4 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const float4& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            binned_dmddeposit(static_cast<ftype>(x.z), 1);
-            binned_dmddeposit(static_cast<ftype>(x.w), 1);
-            return *this;
-        }
-
-        ///Accumulate a double2 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const float2& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            return *this;
-        }
-
-        ///Accumulate a double2 @p x into the binned fp.
-        ///NOTE: Casts @p x to the type of the binned fp
-        ReproducibleFloatingAccumulator& operator+=(const double2& x)
-        {
-            binned_dmdupdate(abs_max(x), 1, 1);
-            binned_dmddeposit(static_cast<ftype>(x.x), 1);
-            binned_dmddeposit(static_cast<ftype>(x.y), 1);
-            return *this;
-        }
-
-        void add(const float4* input, const size_t N, float max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].z), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].w), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const double2* input, const size_t N, double max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const float2* input, const size_t N, double max_abs_val)
-        {
-            if (N == 0)
-                return;
-            binned_dmdupdate(max_abs_val, 1, 1);
-
-            size_t count = 0;
-            for (size_t i = 0; i < N; i++)
-            {
-                binned_dmddeposit(static_cast<ftype>(input[i].x), 1);
-                binned_dmddeposit(static_cast<ftype>(input[i].y), 1);
-
-                if (N > ENDURANCE && count == ENDURANCE)
-                {
-                    binned_dmrenorm(1, 1);
-                    count = 0;
-                }
-            }
-        }
-
-        void add(const float4* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        void add(const double2* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        void add(const float2* input, const size_t N)
-        {
-            if (N == 0)
-                return;
-
-            auto max_abs_val = abs_max(input[0]);
-            for (size_t i = 1; i < N; i++)
-                max_abs_val = fmax(max_abs_val, abs_max(input[i]));
-
-            add(input, N, max_abs_val);
-        }
-
-        //////////////////////////////////////
-        //MANUAL OPERATIONS; USE WISELY
-        //////////////////////////////////////
-
-        ///Rebins for repeated accumulation of scalars with magnitude <= @p mav
-        ///
-        ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator
-        ///with `unsafe_add` after which `renorm()` must be called. See the source of
-        ///`add()` for an example
-        template <typename T,
-            typename std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
-        void set_max_abs_val(const T mav)
-        {
-            binned_dmdupdate(std::abs(mav), 1, 1);
-        }
-
-        ///Add @p x to the binned fp
-        ///
-        ///This is intended to be used after a call to `set_max_abs_val()`
-        void unsafe_add(const ftype x)
-        {
-            binned_dmddeposit(x, 1);
-        }
-
-        ///Renormalizes the binned fp
-        ///
-        ///This is intended to be used after a call to `set_max_abs_val()` and one or
-        ///more calls to `unsafe_add()`
-        void renorm()
-        {
-            binned_dmrenorm(1, 1);
-        }
-    };
-
-    
-}    // namespace hpx::parallel::detail::rfa
\ No newline at end of file

From 08ed522913ca9d5bbf2f94f1d5e97194484f46e0 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Mon, 9 Dec 2024 18:52:14 -0600
Subject: [PATCH 05/19] Added parallel execution of rfa reduction summation

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           | 93 +++++++++++++++++++
 .../algorithms/reduce_deterministic.hpp       |  1 +
 .../unit/algorithms/reduce_deterministic.cpp  | 44 +++++++++
 3 files changed, 138 insertions(+)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index b37730889172..35f128a5aa4e 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -11,6 +11,7 @@
 #include <hpx/functional/invoke.hpp>
 #include <hpx/parallel/algorithms/detail/rfa.hpp>
 #include <hpx/parallel/util/loop.hpp>
+#include <hpx/type_support/pack.hpp>
 
 #include <cstddef>
 #include <cstring>
@@ -65,6 +66,84 @@ namespace hpx::parallel::detail {
         }
     };
 
+    template <typename ExPolicy>
+    struct sequential_reduce_deterministic_rfa_t final
+      : hpx::functional::detail::tag_fallback<
+            sequential_reduce_deterministic_rfa_t<ExPolicy>>
+    {
+    private:
+        template <typename InIterB, typename InIterE, typename T,
+            typename Reduce>
+        friend constexpr hpx::parallel::detail::rfa::
+            ReproducibleFloatingAccumulator<T>
+            tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
+                ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+
+            for (auto e = first; e != last; ++e)
+            {
+                rfa += *e;
+            }
+            return rfa;
+        }
+
+        template <typename InIterB, typename T, typename Reduce>
+        friend constexpr hpx::parallel::detail::rfa::
+            ReproducibleFloatingAccumulator<T>
+            tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
+                ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r)
+        {
+            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            bins.initialize_bins();
+            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+            auto e = first;
+            for (std::size_t i = 0; i < size; ++i, ++e)
+            {
+                rfa += *e;
+            }
+            return rfa;
+        }
+
+        // template <typename InIterB, typename InIterE, typename T,
+        //     typename Reduce
+        //     // typename = std::enable_if_t<hpx::util::contains<T,
+        //     //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //     //         float>,
+        //     //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //     //         double>>::value>
+        //     >
+        // friend constexpr T tag_fallback_invoke(
+        //     sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
+        //     InIterE last, T init, Reduce&& r)
+        // {
+        //     static_assert(hpx::util::contains<T,
+        //         hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //             float>,
+        //         hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+        //             double>>::value);
+        //     hpx::parallel::detail::rfa::RFA_bins<T> bins;
+        //     bins.initialize_bins();
+        //     std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+
+        //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
+        //     rfa.set_max_abs_val(init);
+        //     rfa.unsafe_add(init);
+        //     rfa.renorm();
+        //     for (auto e = first; e != last; ++e)
+        //     {
+        //         rfa += *e;
+        //     }
+        //     return rfa.conv();
+        // }
+    };
+
 #if !defined(HPX_COMPUTE_DEVICE_CODE)
     template <typename ExPolicy>
     inline constexpr sequential_reduce_deterministic_t<ExPolicy>
@@ -80,4 +159,18 @@ namespace hpx::parallel::detail {
     }
 #endif
 
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+    template <typename ExPolicy>
+    inline constexpr sequential_reduce_deterministic_rfa_t<ExPolicy>
+        sequential_reduce_deterministic_rfa =
+            sequential_reduce_deterministic_rfa_t<ExPolicy>{};
+#else
+    template <typename ExPolicy, typename... Args>
+    HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic_rfa(
+        Args&&... args)
+    {
+        return sequential_reduce_deterministic_rfa_t<ExPolicy>{}(
+            std::forward<Args>(args)...);
+    }
+#endif
 }    // namespace hpx::parallel::detail
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 5e9da5edd3ae..2bebafc41300 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include "detail/reduce_deterministic.hpp"
 #if defined(DOXYGEN)
 
 namespace hpx {
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 1119c8113c6b..ac495f0ec56f 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -80,6 +80,49 @@ void test_reduce1(IteratorTag)
     HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r2), r3);
 }
 
+template <typename IteratorTag, typename FloatTypeDeterministic,
+    typename FloatTypeNonDeterministic, size_t LEN = 10007>
+void test_reduce_parallel1(IteratorTag)
+{
+    // check if different type for deterministic and nondeeterministic
+    // and same result i.e. correct computation
+    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
+
+    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
+
+    std::vector<FloatTypeDeterministic> deterministic(LEN);
+    std::vector<FloatTypeNonDeterministic> nondeterministic(LEN);
+
+    std::iota(
+        deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0));
+
+    std::iota(nondeterministic.begin(), nondeterministic.end(),
+        FloatTypeNonDeterministic(0));
+
+    FloatTypeDeterministic val_det(0);
+    FloatTypeNonDeterministic val_non_det(0);
+    auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) {
+        return v1 + v2;
+    };
+
+    FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par,
+        iterator_det(std::begin(deterministic)),
+        iterator_det(std::end(deterministic)), val_det, op);
+
+    // verify values
+    // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par,
+    //     iterator_ndet(std::begin(nondeterministic)),
+    //     iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+
+    FloatTypeNonDeterministic r3 = std::accumulate(
+        nondeterministic.begin(), nondeterministic.end(), val_non_det);
+
+    HPX_TEST_EQ(r1, r3);
+    // HPX_TEST_EQ(r2, r3);
+}
+
 template <typename IteratorTag, typename FloatTypeDeterministic,
     size_t LEN = 10007>
 void test_reduce_determinism(IteratorTag)
@@ -185,6 +228,7 @@ void test_reduce1()
     test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
     test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
     test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
+    test_reduce_parallel1<IteratorTag, float, float, 1000>(IteratorTag());
 }
 
 template <typename IteratorTag>

From 95cd600e90c57d09cf93162faa7d3be67cc49d5b Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 17 Dec 2024 14:22:55 -0500
Subject: [PATCH 06/19] Address inspect tool, check module cmakelists, warnings
 and spell check

- missing includes
- prevent max/min being expanded as macros
- minor spell check correction
- remove pragma once in cpp file
- resolve implicit type conversions to single and double precision in the rfa type and in other places
- add dual license
- remove unnecessary command for macos ci
- use HPX_UNROLL instead of vanilla pragma
- clang-17 cannot unroll so use checks
- add typename qualifier for iterator type

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../algorithms/tests/unit/algorithms/reduce_deterministic.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index ac495f0ec56f..9ad604be2611 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -17,6 +17,7 @@
 #include <limits>
 #include <numeric>
 #include <random>
+#include <string>
 #include <vector>
 
 #include "test_utils.hpp"
@@ -89,7 +90,8 @@ void test_reduce_parallel1(IteratorTag)
     using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
     using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
 
-    using base_iterator_ndet = std::vector<FloatTypeNonDeterministic>::iterator;
+    using base_iterator_ndet =
+        typename std::vector<FloatTypeNonDeterministic>::iterator;
     using iterator_ndet = test::test_iterator<base_iterator_ndet, IteratorTag>;
 
     std::vector<FloatTypeDeterministic> deterministic(LEN);

From 441e53566663079b161ed1655ab138b2f5dc5070 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Wed, 1 Jan 2025 21:36:17 +0530
Subject: [PATCH 07/19] Fix parallel deterministic reduce and add benchmarks

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           |  87 ++++------
 .../algorithms/reduce_deterministic.hpp       |  41 +++++
 .../benchmark_reduce_deterministic.cpp        | 159 ++++++++++++++++++
 3 files changed, 237 insertions(+), 50 deletions(-)
 create mode 100644 libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 35f128a5aa4e..ef0ac5c93a44 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -72,76 +72,63 @@ namespace hpx::parallel::detail {
             sequential_reduce_deterministic_rfa_t<ExPolicy>>
     {
     private:
-        template <typename InIterB, typename InIterE, typename T,
-            typename Reduce>
+        template <typename InIterB, typename T>
         friend constexpr hpx::parallel::detail::rfa::
             ReproducibleFloatingAccumulator<T>
             tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
-                ExPolicy&&, InIterB first, InIterE last, T init, Reduce&& r)
+                ExPolicy&&, InIterB first, std::size_t partition_size, T init,
+                std::true_type&&)
         {
             hpx::parallel::detail::rfa::RFA_bins<T> bins;
             bins.initialize_bins();
-            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
-
-            for (auto e = first; e != last; ++e)
+            rfa.set_max_abs_val(init);
+            rfa.unsafe_add(init);
+            rfa.renorm();
+            size_t count = 0;
+            T max_val = std::abs(*first);
+            std::size_t partition_size_lim = 0;
+            for (auto e = first; partition_size_lim <= partition_size;
+                partition_size_lim++, e++)
             {
-                rfa += *e;
+                T temp_max_val = std::abs(static_cast<T>(*e));
+                if (max_val < temp_max_val)
+                {
+                    rfa.set_max_abs_val(temp_max_val);
+                    max_val = temp_max_val;
+                }
+                rfa.unsafe_add(*e);
+                count++;
+                if (count == rfa.endurance())
+                {
+                    rfa.renorm();
+                    count = 0;
+                }
             }
             return rfa;
         }
 
-        template <typename InIterB, typename T, typename Reduce>
-        friend constexpr hpx::parallel::detail::rfa::
-            ReproducibleFloatingAccumulator<T>
-            tag_fallback_invoke(sequential_reduce_deterministic_rfa_t,
-                ExPolicy&&, InIterB first, std::size_t size, T init, Reduce&& r)
+        template <typename InIterB, typename T>
+        friend constexpr T tag_fallback_invoke(
+            sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
+            std::size_t partition_size, T init, std::false_type&&)
         {
-            hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            hpx::parallel::detail::rfa::RFA_bins<typename T::ftype> bins;
             bins.initialize_bins();
-            std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
+            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
 
-            hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
-            auto e = first;
-            for (std::size_t i = 0; i < size; ++i, ++e)
+            T rfa;
+            rfa += init;
+            std::size_t partition_size_lim = 0;
+            for (auto e = first; partition_size_lim <= partition_size;
+                partition_size_lim++, e++)
             {
-                rfa += *e;
+                rfa += (*e);
             }
             return rfa;
         }
-
-        // template <typename InIterB, typename InIterE, typename T,
-        //     typename Reduce
-        //     // typename = std::enable_if_t<hpx::util::contains<T,
-        //     //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
-        //     //         float>,
-        //     //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
-        //     //         double>>::value>
-        //     >
-        // friend constexpr T tag_fallback_invoke(
-        //     sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
-        //     InIterE last, T init, Reduce&& r)
-        // {
-        //     static_assert(hpx::util::contains<T,
-        //         hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
-        //             float>,
-        //         hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
-        //             double>>::value);
-        //     hpx::parallel::detail::rfa::RFA_bins<T> bins;
-        //     bins.initialize_bins();
-        //     std::memcpy(rfa::bin_host_buffer, &bins, sizeof(bins));
-
-        //     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
-        //     rfa.set_max_abs_val(init);
-        //     rfa.unsafe_add(init);
-        //     rfa.renorm();
-        //     for (auto e = first; e != last; ++e)
-        //     {
-        //         rfa += *e;
-        //     }
-        //     return rfa.conv();
-        // }
     };
 
 #if !defined(HPX_COMPUTE_DEVICE_CODE)
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 2bebafc41300..5503a74ce76f 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -401,6 +401,47 @@ namespace hpx::parallel {
                     ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last,
                     HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r));
             }
+
+            template <typename ExPolicy, typename FwdIterB, typename FwdIterE,
+                typename T_, typename Reduce>
+            static util::detail::algorithm_result_t<ExPolicy, T> parallel(
+                ExPolicy&& policy, FwdIterB first, FwdIterE last, T_&& init,
+                Reduce&& r)
+            {
+                (void) r;
+                if (first == last)
+                {
+                    return util::detail::algorithm_result<ExPolicy, T>::get(
+                        HPX_FORWARD(T_, init));
+                }
+
+                auto f1 = [policy](FwdIterB part_begin, std::size_t part_size)
+                    -> hpx::parallel::detail::rfa::
+                        ReproducibleFloatingAccumulator<T_> {
+                            T_ val = *part_begin;
+                            return hpx::parallel::detail::
+                                sequential_reduce_deterministic_rfa<ExPolicy>(
+                                    HPX_FORWARD(ExPolicy, policy), ++part_begin,
+                                    --part_size, HPX_MOVE(val),
+                                    std::true_type{});
+                        };
+
+                return util::partitioner<ExPolicy, T_,
+                    hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
+                        T_>>::call(HPX_FORWARD(ExPolicy, policy), first,
+                    detail::distance(first, last), HPX_MOVE(f1),
+                    hpx::unwrapping([policy](auto&& results) -> T_ {
+                        return hpx::parallel::detail::
+                            sequential_reduce_deterministic_rfa<ExPolicy>(
+                                HPX_FORWARD(ExPolicy, policy),
+                                hpx::util::begin(results),
+                                hpx::util::size(results),
+                                hpx::parallel::detail::rfa::
+                                    ReproducibleFloatingAccumulator<T_>{},
+                                std::false_type{})
+                                .conv();
+                    }));
+            }
         };
         /// \endcond
     }    // namespace detail
diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
new file mode 100644
index 000000000000..5a267dd6a634
--- /dev/null
+++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
@@ -0,0 +1,159 @@
+//  Copyright (c) 2024 Shreyas Atre
+//
+//  SPDX-License-Identifier: BSL-1.0
+//  Distributed under the Boost Software License, Version 1.0. (See accompanying
+//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <hpx/config.hpp>
+#include <cstddef>
+
+#if !defined(HPX_COMPUTE_DEVICE_CODE)
+#include <hpx/algorithm.hpp>
+#include <hpx/chrono.hpp>
+#include <hpx/execution.hpp>
+#include <hpx/init.hpp>
+#include <hpx/modules/testing.hpp>
+#include <hpx/parallel/algorithms/reduce.hpp>
+#include <hpx/parallel/algorithms/reduce_deterministic.hpp>
+
+#include <numeric>
+#include <random>
+#include <vector>
+
+int seed = 1000;
+std::mt19937 gen(seed);
+
+template <typename T>
+T get_rand(T LO = (std::numeric_limits<T>::min)(),
+    T HI = (std::numeric_limits<T>::max)())
+{
+    return LO +
+        static_cast<T>(std::rand()) /
+        (static_cast<T>(static_cast<T>((RAND_MAX)) / (HI - LO)));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void bench_reduce_deterministic(const auto& policy,
+    const auto& deterministic_shuffled, const auto& val_det, const auto& op)
+{
+    // check if different type for deterministic and nondeeterministic
+    // and same result
+
+    auto r1_shuffled =
+        hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled),
+            std::end(deterministic_shuffled), val_det, op);
+
+    HPX_UNUSED(r1_shuffled);
+}
+
+void bench_reduce(const auto& policy, const auto& deterministic_shuffled,
+    const auto& val_det, const auto& op)
+{
+    auto r = hpx::reduce(policy, (std::begin(deterministic_shuffled)),
+        (std::end(deterministic_shuffled)), val_det, op);
+
+    HPX_UNUSED(r);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+int hpx_main(hpx::program_options::variables_map& vm)
+{
+    std::srand(seed);
+
+    auto test_count = vm["test_count"].as<int>();
+    std::size_t vector_size = vm["vector-size"].as<std::size_t>();
+
+    hpx::util::perftests_init(vm);
+
+    // verify that input is within domain of program
+    if (test_count == 0 || test_count < 0)
+    {
+        std::cerr << "test_count cannot be zero or negative...\n" << std::flush;
+        hpx::local::finalize();
+        return -1;
+    }
+
+    {
+        using FloatTypeDeterministic = float;
+        std::size_t LEN = vector_size;
+
+        constexpr FloatTypeDeterministic num_bounds_det =
+            std::is_same_v<FloatTypeDeterministic, float> ? 1000.0 : 1000000.0;
+
+        std::vector<FloatTypeDeterministic> deterministic(LEN);
+
+        for (size_t i = 0; i < LEN; ++i)
+        {
+            deterministic[i] = get_rand<FloatTypeDeterministic>(
+                -num_bounds_det, num_bounds_det);
+        }
+
+        std::vector<FloatTypeDeterministic> deterministic_shuffled =
+            deterministic;
+
+        std::shuffle(
+            deterministic_shuffled.begin(), deterministic_shuffled.end(), gen);
+
+        FloatTypeDeterministic val_det(41.999);
+
+        auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
+            return v1 + v2;
+        };
+        {
+            hpx::util::perftests_report("reduce", "seq", test_count, [&]() {
+                bench_reduce(
+                    hpx::execution::seq, deterministic_shuffled, val_det, op);
+            });
+        }
+        {
+            hpx::util::perftests_report("reduce", "par", test_count, [&]() {
+                bench_reduce(
+                    hpx::execution::par, deterministic_shuffled, val_det, op);
+            });
+        }
+        {
+            hpx::util::perftests_report(
+                "reduce deterministic", "seq", test_count, [&]() {
+                    bench_reduce_deterministic(hpx::execution::seq,
+                        deterministic_shuffled, val_det, op);
+                });
+        }
+        {
+            hpx::util::perftests_report(
+                "reduce deterministic", "par", test_count, [&]() {
+                    bench_reduce_deterministic(hpx::execution::par,
+                        deterministic_shuffled, val_det, op);
+                });
+        }
+
+        hpx::util::perftests_print_times();
+    }
+
+    return hpx::local::finalize();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+int main(int argc, char* argv[])
+{
+    using namespace hpx::program_options;
+
+    options_description cmdline("usage: " HPX_APPLICATION_STRING " [options]");
+
+    // clang-format off
+    cmdline.add_options()
+        ("test_count", value<int>()->default_value(100),
+            "number of tests to be averaged")
+        ("vector-size", value<std::size_t>()->default_value(1000000),
+            "number of elements to be reduced")
+        ;
+    // clang-format on
+
+    hpx::util::perftests_cfg(cmdline);
+    hpx::local::init_params init_args;
+    init_args.desc_cmdline = cmdline;
+    init_args.cfg = {"hpx.os_threads=all"};
+
+    return hpx::local::init(hpx_main, argc, argv, init_args);
+}
+#endif

From b2ab51bf9fd9f975864adbdbb30937cb7752a59f Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sat, 1 Feb 2025 19:27:52 -0600
Subject: [PATCH 08/19] dbg: a negative value seeps in

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           | 33 ++++++++++---------
 .../algorithms/reduce_deterministic.hpp       | 25 +++++++++++---
 .../unit/algorithms/reduce_deterministic.cpp  | 33 ++++++++++---------
 3 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index ef0ac5c93a44..dc3ee96d6d51 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -36,9 +36,9 @@ namespace hpx::parallel::detail {
         {
             /// TODO: Put constraint on Reduce to be a binary plus operator
             (void) r;
-            hpx::parallel::detail::rfa::RFA_bins<T> bins;
-            bins.initialize_bins();
-            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+            // hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            // bins.initialize_bins();
+            // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
             rfa.set_max_abs_val(init);
@@ -79,18 +79,17 @@ namespace hpx::parallel::detail {
                 ExPolicy&&, InIterB first, std::size_t partition_size, T init,
                 std::true_type&&)
         {
-            hpx::parallel::detail::rfa::RFA_bins<T> bins;
-            bins.initialize_bins();
-            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+            // hpx::parallel::detail::rfa::RFA_bins<T> bins;
+            // bins.initialize_bins();
+            // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
-            rfa.set_max_abs_val(init);
-            rfa.unsafe_add(init);
-            rfa.renorm();
+            rfa.zero();
+            rfa += init;
             size_t count = 0;
             T max_val = std::abs(*first);
             std::size_t partition_size_lim = 0;
-            for (auto e = first; partition_size_lim <= partition_size;
+            for (auto e = first; partition_size_lim < partition_size;
                 partition_size_lim++, e++)
             {
                 T temp_max_val = std::abs(static_cast<T>(*e));
@@ -107,7 +106,8 @@ namespace hpx::parallel::detail {
                     count = 0;
                 }
             }
-            return rfa;
+            printf("rfa res conv: %f\n", rfa.conv());
+            return std::move(rfa);
         }
 
         template <typename InIterB, typename T>
@@ -115,17 +115,20 @@ namespace hpx::parallel::detail {
             sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
             std::size_t partition_size, T init, std::false_type&&)
         {
-            hpx::parallel::detail::rfa::RFA_bins<typename T::ftype> bins;
-            bins.initialize_bins();
-            std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+            // hpx::parallel::detail::rfa::RFA_bins<typename T::ftype> bins;
+            // bins.initialize_bins();
+            // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
 
             T rfa;
+            rfa.zero();
             rfa += init;
             std::size_t partition_size_lim = 0;
-            for (auto e = first; partition_size_lim <= partition_size;
+            for (auto e = first; partition_size_lim < partition_size;
                 partition_size_lim++, e++)
             {
+                printf("rfa: %f rfa val before:%f\n", (*e).conv(), rfa.conv());
                 rfa += (*e);
+                printf("rfa: %f rfa val:%f\n", (*e).conv(), rfa.conv());
             }
             return rfa;
         }
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 5503a74ce76f..c508403e44eb 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -397,6 +397,10 @@ namespace hpx::parallel {
             static constexpr T sequential(ExPolicy&& policy, InIterB first,
                 InIterE last, T_&& init, Reduce&& r)
             {
+                hpx::parallel::detail::rfa::RFA_bins<T_> bins;
+                bins.initialize_bins();
+                std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__,
+                    &bins, sizeof(bins));
                 return hpx::parallel::detail::sequential_reduce_deterministic<
                     ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last,
                     HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r));
@@ -415,6 +419,11 @@ namespace hpx::parallel {
                         HPX_FORWARD(T_, init));
                 }
 
+                hpx::parallel::detail::rfa::RFA_bins<T_> bins;
+                bins.initialize_bins();
+                std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__,
+                    &bins, sizeof(bins));
+
                 auto f1 = [policy](FwdIterB part_begin, std::size_t part_size)
                     -> hpx::parallel::detail::rfa::
                         ReproducibleFloatingAccumulator<T_> {
@@ -430,14 +439,22 @@ namespace hpx::parallel {
                     hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<
                         T_>>::call(HPX_FORWARD(ExPolicy, policy), first,
                     detail::distance(first, last), HPX_MOVE(f1),
-                    hpx::unwrapping([policy](auto&& results) -> T_ {
+                    hpx::unwrapping([policy, init](auto&& results) -> T_ {
+                        // Assumed that
+                        hpx::parallel::detail::rfa::
+                            ReproducibleFloatingAccumulator<T_>
+                                rfa;
+                        rfa.zero();
+                        rfa += init;
+                        for (auto e : results)
+                        {
+                            printf("rfa results %f\n", e.conv());
+                        }
                         return hpx::parallel::detail::
                             sequential_reduce_deterministic_rfa<ExPolicy>(
                                 HPX_FORWARD(ExPolicy, policy),
                                 hpx::util::begin(results),
-                                hpx::util::size(results),
-                                hpx::parallel::detail::rfa::
-                                    ReproducibleFloatingAccumulator<T_>{},
+                                hpx::util::size(results), HPX_MOVE(rfa),
                                 std::false_type{})
                                 .conv();
                     }));
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 9ad604be2611..01defb167887 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -87,7 +87,8 @@ void test_reduce_parallel1(IteratorTag)
 {
     // check if different type for deterministic and nondeeterministic
     // and same result i.e. correct computation
-    using base_iterator_det = std::vector<FloatTypeDeterministic>::iterator;
+    using base_iterator_det =
+        typename std::vector<FloatTypeDeterministic>::iterator;
     using iterator_det = test::test_iterator<base_iterator_det, IteratorTag>;
 
     using base_iterator_ndet =
@@ -109,20 +110,20 @@ void test_reduce_parallel1(IteratorTag)
         return v1 + v2;
     };
 
-    FloatTypeDeterministic r1 = hpx::reduce_deterministic(hpx::execution::par,
-        iterator_det(std::begin(deterministic)),
+    FloatTypeDeterministic r1 = hpx::experimental::reduce_deterministic(
+        hpx::execution::par, iterator_det(std::begin(deterministic)),
         iterator_det(std::end(deterministic)), val_det, op);
 
     // verify values
-    // FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par,
-    //     iterator_ndet(std::begin(nondeterministic)),
-    //     iterator_ndet(std::end(nondeterministic)), val_non_det, op);
+    FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::par,
+        iterator_ndet(std::begin(nondeterministic)),
+        iterator_ndet(std::end(nondeterministic)), val_non_det, op);
 
     FloatTypeNonDeterministic r3 = std::accumulate(
         nondeterministic.begin(), nondeterministic.end(), val_non_det);
 
     HPX_TEST_EQ(r1, r3);
-    // HPX_TEST_EQ(r2, r3);
+    HPX_TEST_EQ(r2, r3);
 }
 
 template <typename IteratorTag, typename FloatTypeDeterministic,
@@ -226,10 +227,10 @@ void test_reduce1()
 {
     using namespace hpx::execution;
 
-    test_reduce1<IteratorTag, float, float, 1000>(IteratorTag());
-    test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
-    test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
-    test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
+    // test_reduce1<IteratorTag, float, float, 1000>(IteratorTag());
+    // test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
+    // test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
+    // test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
     test_reduce_parallel1<IteratorTag, float, float, 1000>(IteratorTag());
 }
 
@@ -238,16 +239,16 @@ void test_reduce2()
 {
     using namespace hpx::execution;
 
-    test_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
-    test_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
+    // test_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
+    // test_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
 }
 
 void reduce_test1()
 {
     test_reduce1<std::random_access_iterator_tag>();
-    test_reduce2<std::random_access_iterator_tag>();
-    test_reduce1<std::forward_iterator_tag>();
-    test_reduce2<std::forward_iterator_tag>();
+    // test_reduce2<std::random_access_iterator_tag>();
+    // test_reduce1<std::forward_iterator_tag>();
+    // test_reduce2<std::forward_iterator_tag>();
 }
 
 ///////////////////////////////////////////////////////////////////////////////

From 1250ec86b5862f08eaf585183c52a4381b98427e Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 13:12:34 -0600
Subject: [PATCH 09/19] fix: do not seed max_val from *first; the strict '<'
 check skips set_max_abs_val for a value equal to max_val

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../detail/reduce_deterministic.hpp           | 25 ++++++++-----------
 .../algorithms/reduce_deterministic.hpp       |  9 +++----
 .../unit/algorithms/reduce_deterministic.cpp  | 18 ++++++-------
 3 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index dc3ee96d6d51..6a4f379c7d7c 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -36,16 +36,16 @@ namespace hpx::parallel::detail {
         {
             /// TODO: Put constraint on Reduce to be a binary plus operator
             (void) r;
-            // hpx::parallel::detail::rfa::RFA_bins<T> bins;
-            // bins.initialize_bins();
-            // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+
+            // __rfa_bin_host_buffer__ should be initialized by the frontend of
+            // this method
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
             rfa.set_max_abs_val(init);
             rfa.unsafe_add(init);
             rfa.renorm();
             size_t count = 0;
-            T max_val = std::abs(*first);
+            T max_val = std::abs(std::numeric_limits<T>::min());
             for (auto e = first; e != last; ++e)
             {
                 T temp_max_val = std::abs(static_cast<T>(*e));
@@ -79,15 +79,14 @@ namespace hpx::parallel::detail {
                 ExPolicy&&, InIterB first, std::size_t partition_size, T init,
                 std::true_type&&)
         {
-            // hpx::parallel::detail::rfa::RFA_bins<T> bins;
-            // bins.initialize_bins();
-            // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+            // __rfa_bin_host_buffer__ should be initialized by the frontend of
+            // this method
 
             hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator<T> rfa;
             rfa.zero();
             rfa += init;
             size_t count = 0;
-            T max_val = std::abs(*first);
+            T max_val = std::abs(std::numeric_limits<T>::min());
             std::size_t partition_size_lim = 0;
             for (auto e = first; partition_size_lim < partition_size;
                 partition_size_lim++, e++)
@@ -106,8 +105,7 @@ namespace hpx::parallel::detail {
                     count = 0;
                 }
             }
-            printf("rfa res conv: %f\n", rfa.conv());
-            return std::move(rfa);
+            return rfa;
         }
 
         template <typename InIterB, typename T>
@@ -115,9 +113,8 @@ namespace hpx::parallel::detail {
             sequential_reduce_deterministic_rfa_t, ExPolicy&&, InIterB first,
             std::size_t partition_size, T init, std::false_type&&)
         {
-            // hpx::parallel::detail::rfa::RFA_bins<typename T::ftype> bins;
-            // bins.initialize_bins();
-            // std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins));
+            // __rfa_bin_host_buffer__ should be initialized by the frontend of
+            // this method
 
             T rfa;
             rfa.zero();
@@ -126,9 +123,7 @@ namespace hpx::parallel::detail {
             for (auto e = first; partition_size_lim < partition_size;
                 partition_size_lim++, e++)
             {
-                printf("rfa: %f rfa val before:%f\n", (*e).conv(), rfa.conv());
                 rfa += (*e);
-                printf("rfa: %f rfa val:%f\n", (*e).conv(), rfa.conv());
             }
             return rfa;
         }
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index c508403e44eb..2427338d2120 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -397,6 +397,7 @@ namespace hpx::parallel {
             static constexpr T sequential(ExPolicy&& policy, InIterB first,
                 InIterE last, T_&& init, Reduce&& r)
             {
+                // TODO: abstract initializing memory
                 hpx::parallel::detail::rfa::RFA_bins<T_> bins;
                 bins.initialize_bins();
                 std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__,
@@ -419,6 +420,7 @@ namespace hpx::parallel {
                         HPX_FORWARD(T_, init));
                 }
 
+                // TODO: abstract initializing memory
                 hpx::parallel::detail::rfa::RFA_bins<T_> bins;
                 bins.initialize_bins();
                 std::memcpy(hpx::parallel::detail::rfa::__rfa_bin_host_buffer__,
@@ -428,6 +430,7 @@ namespace hpx::parallel {
                     -> hpx::parallel::detail::rfa::
                         ReproducibleFloatingAccumulator<T_> {
                             T_ val = *part_begin;
+                            // Assumed that __rfa_bin_host_buffer__ is initialized
                             return hpx::parallel::detail::
                                 sequential_reduce_deterministic_rfa<ExPolicy>(
                                     HPX_FORWARD(ExPolicy, policy), ++part_begin,
@@ -440,16 +443,12 @@ namespace hpx::parallel {
                         T_>>::call(HPX_FORWARD(ExPolicy, policy), first,
                     detail::distance(first, last), HPX_MOVE(f1),
                     hpx::unwrapping([policy, init](auto&& results) -> T_ {
-                        // Assumed that
+                        // Assumed that __rfa_bin_host_buffer__ is initialized
                         hpx::parallel::detail::rfa::
                             ReproducibleFloatingAccumulator<T_>
                                 rfa;
                         rfa.zero();
                         rfa += init;
-                        for (auto e : results)
-                        {
-                            printf("rfa results %f\n", e.conv());
-                        }
                         return hpx::parallel::detail::
                             sequential_reduce_deterministic_rfa<ExPolicy>(
                                 HPX_FORWARD(ExPolicy, policy),
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 01defb167887..6fada2ab597a 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -227,10 +227,10 @@ void test_reduce1()
 {
     using namespace hpx::execution;
 
-    // test_reduce1<IteratorTag, float, float, 1000>(IteratorTag());
-    // test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
-    // test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
-    // test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, float, float, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, double, float, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
+    test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
     test_reduce_parallel1<IteratorTag, float, float, 1000>(IteratorTag());
 }
 
@@ -239,16 +239,16 @@ void test_reduce2()
 {
     using namespace hpx::execution;
 
-    // test_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
-    // test_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
+    test_reduce_determinism<IteratorTag, float, 1000>(IteratorTag());
+    test_reduce_determinism<IteratorTag, double, 1000>(IteratorTag());
 }
 
 void reduce_test1()
 {
     test_reduce1<std::random_access_iterator_tag>();
-    // test_reduce2<std::random_access_iterator_tag>();
-    // test_reduce1<std::forward_iterator_tag>();
-    // test_reduce2<std::forward_iterator_tag>();
+    test_reduce2<std::random_access_iterator_tag>();
+    test_reduce1<std::forward_iterator_tag>();
+    test_reduce2<std::forward_iterator_tag>();
 }
 
 ///////////////////////////////////////////////////////////////////////////////

From 2e8eea15aaabb96d840a2f841fd29aa83b9b4039 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 13:51:03 -0600
Subject: [PATCH 10/19] Add benchmark entry to CMakeLists

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../tests/performance/CMakeLists.txt          |  1 +
 .../benchmark_reduce_deterministic.cpp        | 23 +++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/libs/core/algorithms/tests/performance/CMakeLists.txt b/libs/core/algorithms/tests/performance/CMakeLists.txt
index d74788a9b47f..96ce826dc742 100644
--- a/libs/core/algorithms/tests/performance/CMakeLists.txt
+++ b/libs/core/algorithms/tests/performance/CMakeLists.txt
@@ -16,6 +16,7 @@ set(benchmarks
     benchmark_partial_sort_parallel
     benchmark_partition
     benchmark_partition_copy
+    benchmark_reduce_deterministic
     benchmark_remove
     benchmark_remove_if
     benchmark_scan_algorithms
diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
index 5a267dd6a634..c6604aaea44d 100644
--- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
@@ -33,25 +33,28 @@ T get_rand(T LO = (std::numeric_limits<T>::min)(),
 }
 
 ///////////////////////////////////////////////////////////////////////////////
-
-void bench_reduce_deterministic(const auto& policy,
-    const auto& deterministic_shuffled, const auto& val_det, const auto& op)
+template <typename PolicyT, typename IteratorT, typename InitVal, typename Op>
+void bench_reduce_deterministic(const PolicyT& policy,
+    const IteratorT& deterministic_shuffled, const InitVal& val_det,
+    const Op& op)
 {
     // check if different type for deterministic and nondeeterministic
     // and same result
 
-    auto r1_shuffled =
-        hpx::reduce_deterministic(policy, std::begin(deterministic_shuffled),
-            std::end(deterministic_shuffled), val_det, op);
+    auto r1_shuffled = hpx::experimental::reduce_deterministic(policy,
+        std::begin(deterministic_shuffled), std::end(deterministic_shuffled),
+        val_det, op);
 
     HPX_UNUSED(r1_shuffled);
 }
 
-void bench_reduce(const auto& policy, const auto& deterministic_shuffled,
-    const auto& val_det, const auto& op)
+template <typename PolicyT, typename IteratorT, typename InitVal, typename Op>
+void bench_reduce(const PolicyT& policy,
+    const IteratorT& non_deterministic_shuffled, const InitVal& val_det,
+    const Op& op)
 {
-    auto r = hpx::reduce(policy, (std::begin(deterministic_shuffled)),
-        (std::end(deterministic_shuffled)), val_det, op);
+    auto r = hpx::reduce(policy, (std::begin(non_deterministic_shuffled)),
+        (std::end(non_deterministic_shuffled)), val_det, op);
 
     HPX_UNUSED(r);
 }

From bedbf56c36b32ef2598e1a003ec89497c90f7306 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 14:01:15 -0600
Subject: [PATCH 11/19] fix: missing braces around initializer

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
index b8f5da5f233a..77e43fd5894b 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp
@@ -824,7 +824,7 @@ namespace hpx::parallel::detail::rfa {
         ///Set the binned fp to zero
         void zero()
         {
-            data = {0};
+            data = {{0}};
         }
 
         ///Return the fold of the binned fp

From ca73d3d351bd794fde7b2c398b3d66e8bf84e004 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 16:02:17 -0600
Subject: [PATCH 12/19] Remove unnecessary include

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../include/hpx/parallel/algorithms/reduce_deterministic.hpp     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 2427338d2120..56135161495f 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -10,7 +10,6 @@
 
 #pragma once
 
-#include "detail/reduce_deterministic.hpp"
 #if defined(DOXYGEN)
 
 namespace hpx {

From aba736e03edb39d747bec839904e40b86383debd Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 16:04:02 -0600
Subject: [PATCH 13/19] Remove sanitizer added for testing purpose

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
index 76dc5fcd9806..559ee830030e 100644
--- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
+++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt
@@ -246,7 +246,3 @@ foreach(test ${tests})
     "modules.algorithms.algorithms" ${test} ${${test}_PARAMETERS}
   )
 endforeach()
-
-target_compile_options(reduce_deterministic_test PRIVATE -fsanitize=address)
-
-target_link_options(reduce_deterministic_test PRIVATE -fsanitize=address)
\ No newline at end of file

From 6011678d0335649e79f53b0d20c74ec3b3d5ab8c Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 16:06:22 -0600
Subject: [PATCH 14/19] Remove unnecessary overload added

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/debugging/src/print.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/libs/core/debugging/src/print.cpp b/libs/core/debugging/src/print.cpp
index 8a01d9574853..3d7cf5da2aa0 100644
--- a/libs/core/debugging/src/print.cpp
+++ b/libs/core/debugging/src/print.cpp
@@ -57,10 +57,6 @@ namespace hpx::debug {
             std::ostream&, std::int32_t const&, int);
         template HPX_CORE_EXPORT void print_dec(
             std::ostream&, std::int64_t const&, int);
-#ifdef __APPLE__
-        template HPX_CORE_EXPORT void print_dec(
-            std::ostream&, unsigned long const&, int);
-#endif
         template HPX_CORE_EXPORT void print_dec(
             std::ostream&, std::uint64_t const&, int);
 

From 21339d5d3f72f2df37d30974fcba00b61caa75f9 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 16:06:22 -0600
Subject: [PATCH 15/19] Remove unnecessary type cast and overload

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 libs/core/concurrency/tests/unit/tagged_ptr.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/core/concurrency/tests/unit/tagged_ptr.cpp b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
index d86fc5775415..b29652a3ede1 100644
--- a/libs/core/concurrency/tests/unit/tagged_ptr.cpp
+++ b/libs/core/concurrency/tests/unit/tagged_ptr.cpp
@@ -25,7 +25,7 @@ void tagged_ptr_test()
         i = j;
 
         HPX_TEST_EQ(i.get_ptr(), &b);
-        HPX_TEST_EQ(i.get_tag(), 1UL);
+        HPX_TEST_EQ(i.get_tag(), 1);
     }
 
     {
@@ -43,7 +43,7 @@ void tagged_ptr_test()
 
     {
         tagged_ptr<int> j(&a, max_tag);
-        HPX_TEST_EQ(j.get_next_tag(), 0UL);
+        HPX_TEST_EQ(j.get_next_tag(), 0);
     }
 
     {

From 5af23aec6f23a636be62922e59da9807444c2729 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Sun, 2 Feb 2025 16:14:28 -0600
Subject: [PATCH 16/19] Address clang-format modifications

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../hpx/parallel/algorithms/detail/reduce_deterministic.hpp   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 6a4f379c7d7c..3fc0ec2e4344 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -89,7 +89,7 @@ namespace hpx::parallel::detail {
             T max_val = std::abs(std::numeric_limits<T>::min());
             std::size_t partition_size_lim = 0;
             for (auto e = first; partition_size_lim < partition_size;
-                partition_size_lim++, e++)
+                 partition_size_lim++, e++)
             {
                 T temp_max_val = std::abs(static_cast<T>(*e));
                 if (max_val < temp_max_val)
@@ -121,7 +121,7 @@ namespace hpx::parallel::detail {
             rfa += init;
             std::size_t partition_size_lim = 0;
             for (auto e = first; partition_size_lim < partition_size;
-                partition_size_lim++, e++)
+                 partition_size_lim++, e++)
             {
                 rfa += (*e);
             }

From 1c85d5d6a2f2a3ce5a2837d587478fff5bf83fec Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 4 Feb 2025 21:56:30 -0600
Subject: [PATCH 17/19] Address inspect tool

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../hpx/parallel/algorithms/detail/reduce_deterministic.hpp   | 4 ++--
 .../include/hpx/parallel/algorithms/reduce_deterministic.hpp  | 1 +
 .../tests/performance/benchmark_reduce_deterministic.cpp      | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
index 3fc0ec2e4344..2c2aeabf4ce6 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp
@@ -45,7 +45,7 @@ namespace hpx::parallel::detail {
             rfa.unsafe_add(init);
             rfa.renorm();
             size_t count = 0;
-            T max_val = std::abs(std::numeric_limits<T>::min());
+            T max_val = std::abs((std::numeric_limits<T>::min)());
             for (auto e = first; e != last; ++e)
             {
                 T temp_max_val = std::abs(static_cast<T>(*e));
@@ -86,7 +86,7 @@ namespace hpx::parallel::detail {
             rfa.zero();
             rfa += init;
             size_t count = 0;
-            T max_val = std::abs(std::numeric_limits<T>::min());
+            T max_val = std::abs((std::numeric_limits<T>::min)());
             std::size_t partition_size_lim = 0;
             for (auto e = first; partition_size_lim < partition_size;
                  partition_size_lim++, e++)
diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
index 56135161495f..fddfd2bff1a3 100644
--- a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
+++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp
@@ -371,6 +371,7 @@ namespace hpx {
 
 #include <algorithm>
 #include <cstddef>
+#include <cstring>
 #include <iterator>
 #include <type_traits>
 #include <utility>
diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
index c6604aaea44d..a91387234830 100644
--- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
@@ -16,6 +16,7 @@
 #include <hpx/parallel/algorithms/reduce.hpp>
 #include <hpx/parallel/algorithms/reduce_deterministic.hpp>
 
+#include <iostream>
 #include <numeric>
 #include <random>
 #include <vector>

From 04af5ec0994c64e785a9c29211895a771c8189c2 Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 4 Feb 2025 23:36:39 -0600
Subject: [PATCH 18/19] Add double and float separate benchmarks, more tests

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .../benchmark_reduce_deterministic.cpp        | 155 ++++++++++++------
 .../unit/algorithms/reduce_deterministic.cpp  |   7 +-
 2 files changed, 113 insertions(+), 49 deletions(-)

diff --git a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
index a91387234830..f2315f988750 100644
--- a/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/performance/benchmark_reduce_deterministic.cpp
@@ -79,56 +79,117 @@ int hpx_main(hpx::program_options::variables_map& vm)
     }
 
     {
-        using FloatTypeDeterministic = float;
-        std::size_t LEN = vector_size;
-
-        constexpr FloatTypeDeterministic num_bounds_det =
-            std::is_same_v<FloatTypeDeterministic, float> ? 1000.0 : 1000000.0;
-
-        std::vector<FloatTypeDeterministic> deterministic(LEN);
-
-        for (size_t i = 0; i < LEN; ++i)
-        {
-            deterministic[i] = get_rand<FloatTypeDeterministic>(
-                -num_bounds_det, num_bounds_det);
-        }
-
-        std::vector<FloatTypeDeterministic> deterministic_shuffled =
-            deterministic;
-
-        std::shuffle(
-            deterministic_shuffled.begin(), deterministic_shuffled.end(), gen);
-
-        FloatTypeDeterministic val_det(41.999);
-
-        auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
-            return v1 + v2;
-        };
-        {
-            hpx::util::perftests_report("reduce", "seq", test_count, [&]() {
-                bench_reduce(
-                    hpx::execution::seq, deterministic_shuffled, val_det, op);
-            });
-        }
-        {
-            hpx::util::perftests_report("reduce", "par", test_count, [&]() {
-                bench_reduce(
-                    hpx::execution::par, deterministic_shuffled, val_det, op);
-            });
-        }
         {
-            hpx::util::perftests_report(
-                "reduce deterministic", "seq", test_count, [&]() {
-                    bench_reduce_deterministic(hpx::execution::seq,
-                        deterministic_shuffled, val_det, op);
-                });
+            using FloatTypeDeterministic = float;
+            std::size_t LEN = vector_size;
+
+            constexpr FloatTypeDeterministic num_bounds_det =
+                std::is_same_v<FloatTypeDeterministic, float> ? 1000.0 :
+                                                                1000000.0;
+
+            std::vector<FloatTypeDeterministic> deterministic(LEN);
+
+            for (size_t i = 0; i < LEN; ++i)
+            {
+                deterministic[i] = get_rand<FloatTypeDeterministic>(
+                    -num_bounds_det, num_bounds_det);
+            }
+
+            std::vector<FloatTypeDeterministic> deterministic_shuffled =
+                deterministic;
+
+            std::shuffle(deterministic_shuffled.begin(),
+                deterministic_shuffled.end(), gen);
+
+            FloatTypeDeterministic val_det(41.999);
+
+            auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
+                return v1 + v2;
+            };
+            {
+                hpx::util::perftests_report(
+                    "fl reduce", "seq", test_count, [&]() {
+                        bench_reduce(hpx::execution::seq,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
+            {
+                hpx::util::perftests_report(
+                    "fl reduce", "par", test_count, [&]() {
+                        bench_reduce(hpx::execution::par,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
+            {
+                hpx::util::perftests_report(
+                    "fl reduce deterministic", "seq", test_count, [&]() {
+                        bench_reduce_deterministic(hpx::execution::seq,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
+            {
+                hpx::util::perftests_report(
+                    "fl reduce deterministic", "par", test_count, [&]() {
+                        bench_reduce_deterministic(hpx::execution::par,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
         }
         {
-            hpx::util::perftests_report(
-                "reduce deterministic", "par", test_count, [&]() {
-                    bench_reduce_deterministic(hpx::execution::par,
-                        deterministic_shuffled, val_det, op);
-                });
+            using FloatTypeDeterministic = double;
+            std::size_t LEN = vector_size;
+
+            constexpr FloatTypeDeterministic num_bounds_det =
+                std::is_same_v<FloatTypeDeterministic, float> ? 1000.0 :
+                                                                1000000.0;
+
+            std::vector<FloatTypeDeterministic> deterministic(LEN);
+
+            for (size_t i = 0; i < LEN; ++i)
+            {
+                deterministic[i] = get_rand<FloatTypeDeterministic>(
+                    -num_bounds_det, num_bounds_det);
+            }
+
+            std::vector<FloatTypeDeterministic> deterministic_shuffled =
+                deterministic;
+
+            std::shuffle(deterministic_shuffled.begin(),
+                deterministic_shuffled.end(), gen);
+
+            FloatTypeDeterministic val_det(41.999);
+
+            auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) {
+                return v1 + v2;
+            };
+            {
+                hpx::util::perftests_report(
+                    "dbl reduce", "seq", test_count, [&]() {
+                        bench_reduce(hpx::execution::seq,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
+            {
+                hpx::util::perftests_report(
+                    "dbl reduce", "par", test_count, [&]() {
+                        bench_reduce(hpx::execution::par,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
+            {
+                hpx::util::perftests_report(
+                    "dbl reduce deterministic", "seq", test_count, [&]() {
+                        bench_reduce_deterministic(hpx::execution::seq,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
+            {
+                hpx::util::perftests_report(
+                    "dbl reduce deterministic", "par", test_count, [&]() {
+                        bench_reduce_deterministic(hpx::execution::par,
+                            deterministic_shuffled, val_det, op);
+                    });
+            }
         }
 
         hpx::util::perftests_print_times();
diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
index 6fada2ab597a..5dcf41f4d9c5 100644
--- a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
+++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp
@@ -122,8 +122,8 @@ void test_reduce_parallel1(IteratorTag)
     FloatTypeNonDeterministic r3 = std::accumulate(
         nondeterministic.begin(), nondeterministic.end(), val_non_det);
 
-    HPX_TEST_EQ(r1, r3);
-    HPX_TEST_EQ(r2, r3);
+    HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r1), r3);
+    HPX_TEST_EQ(static_cast<FloatTypeNonDeterministic>(r2), r3);
 }
 
 template <typename IteratorTag, typename FloatTypeDeterministic,
@@ -232,6 +232,9 @@ void test_reduce1()
     test_reduce1<IteratorTag, float, double, 1000>(IteratorTag());
     test_reduce1<IteratorTag, double, double, 1000>(IteratorTag());
     test_reduce_parallel1<IteratorTag, float, float, 1000>(IteratorTag());
+    test_reduce_parallel1<IteratorTag, float, double, 1000>(IteratorTag());
+    test_reduce_parallel1<IteratorTag, double, float, 1000>(IteratorTag());
+    test_reduce_parallel1<IteratorTag, double, double, 1000>(IteratorTag());
 }
 
 template <typename IteratorTag>

From 2319abe9915a4f151f11806ec0863ff5f47eee6b Mon Sep 17 00:00:00 2001
From: Shreyas Atre <shreyasatre16@gmail.com>
Date: Tue, 11 Feb 2025 19:23:59 -0600
Subject: [PATCH 19/19] Fix macos CI

Signed-off-by: Shreyas Atre <shreyasatre16@gmail.com>
---
 .github/workflows/macos_debug_fetch_hwloc.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml
index c1a54b4186e4..7b0044941751 100644
--- a/.github/workflows/macos_debug_fetch_hwloc.yml
+++ b/.github/workflows/macos_debug_fetch_hwloc.yml
@@ -36,6 +36,7 @@ jobs:
               -DHPX_WITH_VERIFY_LOCKS=ON \
               -DHPX_WITH_VERIFY_LOCKS_BACKTRACE=ON \
               -DHPX_WITH_CHECK_MODULE_DEPENDENCIES=ON
+          ln -s "$(which aclocal)" /opt/homebrew/bin/aclocal-1.16
           cd build/_deps/hwloc-src/ && autoreconf -f -i
     - name: Build
       shell: bash