From 46f1c4e28b67894321fb044db02e4048e43fe7ed Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 18 Oct 2023 19:50:46 -0700
Subject: [PATCH 1/3] fft gpu - perf analysis remains

---
 ...-stdexec-multicore.cpp => fft-stdexec.cpp} | 156 +++++++++++-------
 apps/fft/fft.hpp | 15 ++
 2 files changed, 107 insertions(+), 64 deletions(-)
 rename apps/fft/{fft-stdexec-multicore.cpp => fft-stdexec.cpp} (50%)

diff --git a/apps/fft/fft-stdexec-multicore.cpp b/apps/fft/fft-stdexec.cpp
similarity index 50%
rename from apps/fft/fft-stdexec-multicore.cpp
rename to apps/fft/fft-stdexec.cpp
index 045bb6a..b93736a 100644
--- a/apps/fft/fft-stdexec-multicore.cpp
+++ b/apps/fft/fft-stdexec.cpp
@@ -28,78 +28,94 @@
  * commons for the fft codes
  */
 
+#define FFT_STDEXEC
 #include "fft.hpp"
 
+using namespace nvexec;
+
 using any_void_sender =
   any_sender_of;
 
 //
-// recursive multicore fft
+// fft algorithm
 //
-any_void_sender fft_multicore(sender auto &&snd, data_t *x, int lN, const int N, int max_threads)
+std::vector fft(scheduler auto sch, data_t *x, const int N, const int max_threads)
 {
-    // current merge stride
-    int stride = N/lN;
-
-    // to check parallelism
-    //std::cout << "lN = " << lN << ", from tid: " << std::this_thread::get_id() << std::endl;
+    std::vector x_rev(N);
+    std::vector ind(N);
 
-    // if parallelism > max threads => serial
-    if (stride >= max_threads)
-    {
-        // TODO: can this be improved? Putting it in ex::then doesn't sync
-        fft_serial(x, lN, N);
-        return just();
-    }
+    data_t *x_r = x_rev.data();
+    uint32_t *id = ind.data();
 
-    // base case
-    if (lN == 2)
-    {
-        // TODO: can this be improved? Putting it in ex::then doesn't sync
-        auto x_0 = x[0] + x[1]* WNk(N, 0);
-        x[1] = x[0] - x[1]* WNk(N, 0);
-        x[0] = x_0;
+    int shift = 32 - ilog2(N);
 
-        return just();
-    }
+    ex::sender auto twiddle = ex::transfer_just(sch, x_r, x, id)
+        | ex::bulk(N, [=](int k, auto x_r, auto x, auto id){
+            id[k] = reverse_bits32(k) >> shift;
+            x_r[k] = x[id[k]];
+        })
+        | ex::then([](auto &&...){});
 
-    // vectors for even and odd index elements
-    std::vector e(lN/2);
-    std::vector o(lN/2);
+    ex::sync_wait(twiddle);
 
-    // array to use in bulk
-    std::array dat{e.data(), o.data()};
+    // niterations
+    int niters = ilog2(N);
+    // local merge partition size
+    int lN = 2;
 
-    // local thread pool and scheduler
-    exec::static_thread_pool pool_loc{std::min(lN/2, max_threads)};
-    ex::sender auto snd_loc = schedule(pool_loc.get_scheduler());
+    // set cout precision
+    std::cout << std::fixed << std::setprecision(1);
 
-    // copy even and odd indexes to vectors and split sender
-    ex::sender auto merge =
-        ex::bulk(snd, lN/2, [&](int k){
-            // copy data into vectors
-            e[k] = x[2*k];
-            o[k] = x[2*k+1];
-        })
-        | ex::bulk(2, [=,&dat](int k){
-            // NVC++ 23.1: passing `snd` here results in (nvc++-Fatal-/path/to/tools/cpp1 TERMINATED by signal 11)
-            // NVC++ 23.7 goes in forever loop
+    // transfer_just sender
+    ex::sender auto tx = ex::transfer_just(sch, x_r);
 
-            // compute N/2 pt FFT on even and odd in bulk
-            fft_multicore(snd_loc, dat[k], lN/2, N, max_threads);
-        })
-        | ex::bulk(lN/2, [&](int k){
+    for (int k = 0; k < niters; k++, lN*=2)
+    {
+        std::cout << "FFT progress: " << (100.0 * k)/niters << "%" << std::endl;
+
+        // number of partitions
+        int stride = N/lN;
+
+        if (lN < max_threads)
+        {
+            //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << stride << ", each thread = " << lN/2 << std::endl;
+            ex::sender auto merge = tx | ex::bulk(stride, [=](auto k, auto y)
+            {
+                // combine even and odd FFTs
+                for (int i = 0; i < lN/2;
i++) + { + auto e = i + k*lN; + auto o = i + k*lN + lN/2; + auto tmp = y[e] + y[o] * WNk(N, i * stride); + y[o] = y[e] - y[o] * WNk(N, i * stride); + y[e] = tmp; + } + }); + + ex::sync_wait(std::move(merge)); + } + else + { + //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << lN/2 << ", x times called = " << stride << std::endl; // combine even and odd FFTs - x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); - }); - - // wait to complete - ex::sync_wait(std::move(merge)); + for (int i = 0; i < stride; i++) + { + ex::sender auto merge = tx | ex::bulk(lN/2, [=](auto k, auto y) + { + auto e = k + i*lN; + auto o = k + i*lN + lN/2; + auto tmp = y[e] + y[o] * WNk(N, k * stride); + y[o] = y[e] - y[o] * WNk(N, k * stride); + y[e] = tmp; + }); + + ex::sync_wait(std::move(merge)); + } + } + } - // return void sender - return just(); + return x_rev; } // @@ -125,6 +141,7 @@ int main(int argc, char* argv[]) bool print_sig = args.print_sig; bool print_time = args.print_time; bool validate = args.validate; + std::string sched = args.sch; // x[n] signal sig_t x_n(N, sig_type); @@ -137,9 +154,6 @@ int main(int argc, char* argv[]) x_n.resize(N); } - // y[n] = fft(x[n]); - sig_t y_n(x_n); - if (print_sig) { std::cout << std::endl << "x[n] = "; @@ -147,18 +161,32 @@ int main(int argc, char* argv[]) std::cout << std::endl; } - // niterations - int niters = ilog2(N); - - // thread pool and scheduler - exec::static_thread_pool pool{max_threads}; - scheduler auto sched = pool.get_scheduler(); + // y[n] = fft(x[n]); + std::vector y; // start the timer here Timer timer; - // fft radix-2 algorithm - fft_multicore(schedule(sched), y_n.data(), N, N, max_threads); + // initialize stdexec scheduler + sch_t scheduler = get_sch_enum(sched); + + // launch with appropriate stdexec scheduler + switch (scheduler) { + case sch_t::CPU: + y = fft(exec::static_thread_pool(max_threads).get_scheduler(), x_n.data(), N, max_threads); + break; + case sch_t::GPU: + y = fft(nvexec::stream_context().get_scheduler(), x_n.data(), N, 1024*108); + break; + case sch_t::MULTIGPU: + y = fft(nvexec::multi_gpu_stream_context().get_scheduler(), x_n.data(), N, 4*1024*108); + break; + default: + throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); + } + + // y[n] = fft(x[n]) + sig_t y_n(y); // stop timer auto elapsed = timer.stop(); diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 1700a30..ae63fdd 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -34,6 +34,8 @@ #include #include +#include +#include #include #include "exec/static_thread_pool.hpp" #include "argparse/argparse.hpp" @@ -100,6 +102,10 @@ struct fft_params_t : public argparse::Args { bool& print_sig = flag("p,print", "print x[n] and X(k)"); int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); +#if defined(FFT_STDEXEC) + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); +#endif // FFT_STDEXEC + bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]"); bool& help = flag("h, help", "print help"); bool& print_time = flag("t,time", "print fft time"); @@ -142,6 +148,15 @@ bool complex_compare(data_t a, data_t b, double error = 0.0101) return (r && i); } +uint32_t reverse_bits32(uint32_t x) +{ + x = ((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1); + x = ((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2); + x = ((x & 0xf0f0f0f0) >> 4) | ((x & 
0x0f0f0f0f) << 4); + x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8); + return (x >> 16) | (x << 16); +} + class signal { public: From af2a10d4d5c1873960354d2536a2affc5953362d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 19 Oct 2023 20:28:48 -0700 Subject: [PATCH 2/3] optimize fft, cmake cleanup, using linalg in DFT --- CMakeLists.txt | 9 +- apps/1d_stencil/CMakeLists.txt | 2 +- apps/1d_stencil/stencil_stdexec.cpp | 13 +- apps/comm-study/comm-study-no-senders.cpp | 2 +- apps/comm-study/comm-study.cpp | 142 ----------------- apps/fft/CMakeLists.txt | 14 +- apps/fft/fft-serial.cpp | 71 ++++++++- apps/fft/fft-stdexec.cpp | 97 ++++++------ apps/fft/fft.hpp | 154 +++++++------------ apps/heat-equation/CMakeLists.txt | 4 +- apps/heat-equation/heat-equation-stdexec.cpp | 2 + apps/heat-equation/heat-equation.hpp | 17 +- include/commons.hpp | 12 +- 13 files changed, 234 insertions(+), 305 deletions(-) delete mode 100644 apps/comm-study/comm-study.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c6ee0c..1a43330 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,9 +171,16 @@ endif() # need to add appropriate flags for stdexec set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} --gcc-toolchain=/opt/cray/pe/gcc/12.2.0/bin/ -pthread" + "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} -pthread" ) +# add -cudalib=cublas if -stdpar=gpu +if (STDPAR STREQUAL "gpu") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPUSTDPAR") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UGPUSTDPAR") +endif() + # ############################################################################## # Add sub-directories # ############################################################################## diff --git a/apps/1d_stencil/CMakeLists.txt b/apps/1d_stencil/CMakeLists.txt index c5157d2..93f4fc2 100644 --- a/apps/1d_stencil/CMakeLists.txt +++ b/apps/1d_stencil/CMakeLists.txt @@ -13,7 +13,6 @@ target_include_directories( PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) -if("${STDPAR}" STREQUAL "gpu") # TODO, fix cmake add_executable(stencil_stdexec stencil_stdexec.cpp) target_link_libraries(stencil_stdexec stdexec) @@ -22,6 +21,7 @@ if("${STDPAR}" STREQUAL "gpu") PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) +if("${STDPAR}" STREQUAL "gpu") add_executable(stencil_cuda stencil_cuda.cpp) target_include_directories( stencil_cuda diff --git a/apps/1d_stencil/stencil_stdexec.cpp b/apps/1d_stencil/stencil_stdexec.cpp index ace96bc..e2610a1 100644 --- a/apps/1d_stencil/stencil_stdexec.cpp +++ b/apps/1d_stencil/stencil_stdexec.cpp @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 Weile Wei * Copyright (c) 2023 The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of any * required approvals from the U.S. Dept. of Energy).All rights reserved. @@ -27,8 +27,10 @@ // // This example provides a stdexec implementation for the 1D stencil code. 
#include +#if defined(GPUSTDPAR) #include #include +#endif #include #include "argparse/argparse.hpp" @@ -45,7 +47,12 @@ struct args_params_t : public argparse::Args { bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (GPUSTDPAR) + ", gpu, multigpu" + #endif //GPUSTDPAR + "]").set_default("cpu"); + int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); }; @@ -121,12 +128,14 @@ int benchmark(args_params_t const& args) { case sch_t::CPU: solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt); break; +#if defined(GPUSTDPAR) case sch_t::GPU: solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt); break; case sch_t::MULTIGPU: solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt); break; +#endif // GPUSTDPAR default: std::cerr << "Unknown scheduler type encountered." << std::endl; break; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1550094..1377745 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -74,7 +74,7 @@ auto work(P& A, P& B, P& Y, int N) { // get sum(Y) - one last memcpy (not USM) D2H sum += - std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); + std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); return sum / N; } diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp deleted file mode 100644 index 7629ce0..0000000 --- a/apps/comm-study/comm-study.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2023 The Regents of the University of California, - * through Lawrence Berkeley National Laboratory (subject to receipt of any - * required approvals from the U.S. Dept. of Energy). All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "commons.hpp" -#include "exec/static_thread_pool.hpp" - -using namespace std; -using namespace stdexec; -using stdexec::sync_wait; - -using T = double; -using time_point_t = std::chrono::system_clock::time_point; - -// must take in the pointers/vectors by reference -template -auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = then(just(), - [&] { - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); - }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then(just(), [&] { - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); - }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = - then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &A[0], [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &Y[0], [&](T& ai, T& bi) { - return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); - }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { - return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, - std::plus()); - }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; -} - -int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0.0; - -#if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); - - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - - std::cout << fixed << "sum: " << sum << "\n"; -#endif - -#if 1 // 0 if only vectors - - // allocate memory - can we just allocate it on device only? 
- T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; -#endif - - return 0; -} \ No newline at end of file diff --git a/apps/fft/CMakeLists.txt b/apps/fft/CMakeLists.txt index 25b80ce..10e89c1 100644 --- a/apps/fft/CMakeLists.txt +++ b/apps/fft/CMakeLists.txt @@ -2,6 +2,11 @@ project(fft LANGUAGES CXX) file(GLOB CPP_SOURCES "*.cpp") +# add -cudalib=cublas if -stdpar=gpu +if (STDPAR STREQUAL "gpu") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -cudalib=cublas") +endif() + foreach(source_file ${CPP_SOURCES}) if(NOT STDPAR STREQUAL "gpu") if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" @@ -18,16 +23,19 @@ foreach(source_file ${CPP_SOURCES}) add_executable(${exec_name} ${_EXCLUDE} ${source_file}) # add dependency on argparse - add_dependencies(${exec_name} argparse magic_enum) + add_dependencies(${exec_name} argparse) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX LINKER_LANGUAGE CXX) target_include_directories( ${exec_name} PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include - ${ARGPARSE_INCLUDE_DIR} ${MAGICENUM_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + + # uncomment only if using nvc++/23.1 - no need if nvc++/23.7 + # target_link_directories(${exec_name} PRIVATE /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64) - target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec) + target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec blas) set_target_properties( ${exec_name} diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 38082e4..7400023 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -30,6 +30,67 @@ #include "fft.hpp" +// +// serial fft function +// +std::vector fft_serial(data_t *x, const int N, bool debug = false) +{ + std::vector x_r(N); + std::vector id(N); + + // bit shift + int shift = 32 - ilog2(N); + + // twiddle data in x[n] + for (int k = 0; k < N; k++) + { + id[k] = reverse_bits32(k) >> shift; + x_r[k] = x[id[k]]; + } + + // niterations + int niters = ilog2(N); + // local merge partition size + int lN = 2; + + // set cout precision + std::cout << std::fixed << std::setprecision(1); + + std::cout << "FFT progress: "; + + for (int k = 0; k < niters; k++, lN*=2) + { + std::cout << (100.0 * k)/niters << "%.." 
<< std::flush; + + static Timer dtimer; + + // number of partitions + int nparts = N/lN; + int tpp = lN/2; + + if (debug) + dtimer.start(); + + // merge + for (int k = 0; k < N/2; k++) + { + // compute indices + int e = (k/tpp)*lN + (k % tpp); + auto o = e + tpp; + auto i = (k % tpp); + auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts); + x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts); + x_r[e] = tmp; + } + + if (debug) + std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; + } + + std::cout << "100%" << std::endl; + return x_r; +} + // // simulation // @@ -64,9 +125,6 @@ int main(int argc, char* argv[]) x_n.resize(N); } - // y[n] = fft(x[n]); - sig_t y_n(x_n); - if (print_sig) { std::cout << std::endl << "x[n] = "; @@ -81,7 +139,10 @@ int main(int argc, char* argv[]) Timer timer; // fft radix-2 algorithm - fft_serial(y_n.data(), N, N); + auto &&y = fft_serial(x_n.data(), N, args.debug); + + // y[n] = fft(x[n]); + sig_t y_n(y); // stop timer auto elapsed = timer.stop(); @@ -101,7 +162,7 @@ int main(int argc, char* argv[]) // validate the recursively computed fft if (validate) { - if (x_n.isFFT(y_n)) + if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; else std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; diff --git a/apps/fft/fft-stdexec.cpp b/apps/fft/fft-stdexec.cpp index b93736a..a725f6b 100644 --- a/apps/fft/fft-stdexec.cpp +++ b/apps/fft/fft-stdexec.cpp @@ -31,16 +31,10 @@ #define FFT_STDEXEC #include "fft.hpp" -using namespace nvexec; - -using any_void_sender = - any_sender_of; - // // fft algorithm // -std::vector fft(scheduler auto sch, data_t *x, const int N, const int max_threads) +std::vector fft(data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) { std::vector x_rev(N); std::vector ind(N); @@ -66,55 +60,46 @@ std::vector fft(scheduler auto sch, data_t *x, const int N, const int ma // set cout precision std::cout << std::fixed << std::setprecision(1); + std::cout << "FFT progress: "; // transfer_just sender ex::sender auto tx = ex::transfer_just(sch, x_r); for (int k = 0; k < niters; k++, lN*=2) { - std::cout << "FFT progress: " << (100.0 * k)/niters << "%" << std::endl; + std::cout << (100.0 * k)/niters << "%.." 
<< std::flush; + + static Timer dtimer; // number of partitions - int stride = N/lN; + int nparts = N/lN; + int tpp = lN/2; - if (lN < max_threads) + if (debug) { - //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << stride << ", each thread = " << lN/2 << std::endl; - ex::sender auto merge = tx | ex::bulk(stride, [=](auto k, auto y) - { - // combine even and odd FFTs - for (int i = 0; i < lN/2; i++) - { - auto e = i + k*lN; - auto o = i + k*lN + lN/2; - auto tmp = y[e] + y[o] * WNk(N, i * stride); - y[o] = y[e] - y[o] * WNk(N, i * stride); - y[e] = tmp; - } - }); - - ex::sync_wait(std::move(merge)); + dtimer.start(); + std::cout << "lN = " << lN << ", npartitions = " << nparts << ", partition size = " << tpp << std::endl; } - else + + ex::sender auto merge = tx | ex::bulk(N/2, [=](auto k, auto y) { - //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << lN/2 << ", x times called = " << stride << std::endl; - // combine even and odd FFTs - for (int i = 0; i < stride; i++) - { - ex::sender auto merge = tx | ex::bulk(lN/2, [=](auto k, auto y) - { - auto e = k + i*lN; - auto o = k + i*lN + lN/2; - auto tmp = y[e] + y[o] * WNk(N, k * stride); - y[o] = y[e] - y[o] * WNk(N, k * stride); - y[e] = tmp; - }); - - ex::sync_wait(std::move(merge)); - } - } + // compute indices + int e = (k/tpp)*lN + (k % tpp); + auto o = e + tpp; + auto i = (k % tpp); + auto tmp = y[e] + y[o] * WNk(N, i * nparts); + y[o] = y[e] - y[o] * WNk(N, i * nparts); + y[e] = tmp; + }); + + ex::sync_wait(std::move(merge)); + + if (debug) + std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; } + std::cout << "100%" << std::endl; + return x_rev; } @@ -173,14 +158,16 @@ int main(int argc, char* argv[]) // launch with appropriate stdexec scheduler switch (scheduler) { case sch_t::CPU: - y = fft(exec::static_thread_pool(max_threads).get_scheduler(), x_n.data(), N, max_threads); + y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug); break; +#if defined(GPUSTDPAR) case sch_t::GPU: - y = fft(nvexec::stream_context().get_scheduler(), x_n.data(), N, 1024*108); + y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024*108, args.debug); break; case sch_t::MULTIGPU: - y = fft(nvexec::multi_gpu_stream_context().get_scheduler(), x_n.data(), N, 4*1024*108); + y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4*1024*108, args.debug); break; +#endif // GPUSTDPAR default: throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); } @@ -206,7 +193,25 @@ int main(int argc, char* argv[]) // validate the recursively computed fft if (validate) { - if (x_n.isFFT(y_n)) + bool verify = true; + // launch with appropriate stdexec scheduler + switch (scheduler) { + case sch_t::CPU: + verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler()); + break; +#if defined (GPUSTDPAR) + case sch_t::GPU: + verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); + break; + case sch_t::MULTIGPU: + verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); + break; +#endif // GPUSTDPAR + default: + throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); + } + + if (verify) std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; else std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index ae63fdd..73280c5 100644 --- a/apps/fft/fft.hpp +++ 
b/apps/fft/fft.hpp @@ -34,11 +34,17 @@ #include #include -#include -#include -#include -#include "exec/static_thread_pool.hpp" +#include + +#if defined(GPUSTDPAR) + #include + #include +using namespace nvexec; +#endif //GPUSTDPAR + +#include #include "argparse/argparse.hpp" + #include "commons.hpp" using namespace std; @@ -48,10 +54,6 @@ using stdexec::sync_wait; namespace ex = stdexec; -template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; - // 2D view using view_2d = std::extents; @@ -63,7 +65,7 @@ using data_t = std::complex; enum sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; using sig_type_t = sig_type; -#if defined (__NVCOMPILER) +// map for signals std::map sigmap{{"square",sig_type_t::square}, {"sinusoid", sig_type_t::sinusoid}, {"triangle", sig_type_t::sawtooth}, {"triangle", sig_type_t::triangle}, {"sinc", sig_type_t::sinc}, {"box", sig_type_t::box}}; @@ -80,22 +82,10 @@ sig_type_t getSignal(std::string &sig) } } -#else - -// if GCC available then just return yourself -sig_type_t getSignal(sig_type_t &sig) { return sig; } - -#endif // _NVCOMPILER - // input arguments struct fft_params_t : public argparse::Args { - - // NVC++ is not supported by magic_enum -#if !defined (__NVCOMPILER) - sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(box); -#else + // NVC++ is not supported by magic_enum so using strings std::string& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default("box"); -#endif // !defined (__NVCOMPILER) int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); int& N = kwarg("N", "N-point FFT").set_default(1024); @@ -103,12 +93,17 @@ struct fft_params_t : public argparse::Args { int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); #if defined(FFT_STDEXEC) - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (GPUSTDPAR) + ", gpu, multigpu" + #endif //GPUSTDPAR + "]").set_default("cpu"); #endif // FFT_STDEXEC bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]"); bool& help = flag("h, help", "print help"); bool& print_time = flag("t,time", "print fft time"); + bool& debug = flag("d,debug", "print internal timers and launch configs"); }; inline bool isPowOf2(long long int x) { @@ -143,9 +138,7 @@ inline int ilog2(uint32_t x) bool complex_compare(data_t a, data_t b, double error = 0.0101) { auto r = (fabs(a.real() - b.real()) < error)? true: false; - auto i = (fabs(a.imag() - b.imag()) < error)? true: false; - - return (r && i); + return r && (fabs(a.imag() - b.imag()) < error)? 
true: false; } uint32_t reverse_bits32(uint32_t x) @@ -281,93 +274,60 @@ class signal std::cout << "]" << std::endl; } - bool isFFT(signal &X, int threads = std::thread::hardware_concurrency()) + bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) { int N = y.size(); bool ret = true; - data_t *Y = new data_t[N]; - data_t * M = new data_t[N*N]; - auto A = std::mdspan(M, N, N); + //int nparts = N/maxN; + //int psize = std::min(N, nparts); + //int matsize = psize * psize; - // scheduler from a thread pool - exec::static_thread_pool ctx{std::min(threads, A.extent(0))}; - scheduler auto sch = ctx.get_scheduler(); + std::vector Y(N); + std::vector M(N*N); - ex::sender auto test = ex::bulk(schedule(sch), A.extent(0), [&](int i){ - for (auto j = 0; j < A.extent(1); j++){ - A(i, j) = WNk(N, i*j); - } - }) - // Compute fft - | ex::bulk(A.extent(0), [&](int i){ - for (auto j = 0; j < A.extent(1); j++){ - Y[i]+= A(i,j) * y[j]; - } - }) - // compare the computed fft with input - | ex::bulk(N, [&](int i){ - if (!complex_compare(X[i], Y[i])) + auto A = std::mdspan(M.data(), N, N); + auto mdy = std::mdspan(y.data(), N, 1); + auto mdY = std::mdspan(Y.data(), N, 1); + + data_t *F = M.data(); + data_t *X_ptr = X.data(); + data_t *Y_ptr = Y.data(); + + ex::sender auto init = ex::transfer_just(sch, F) | ex::bulk(N*N, [=](int k, auto F){ + int i = k / N; + int j = k % N; + F[k] = WNk(N, i*j); + }); + + // initialize + ex::sync_wait(init); + + // compute Y[n] = dft(x[n]) = WNk * x[n] + stdex::linalg::matrix_product(std::execution::par, A, mdy, mdY); + + // compare the computed Y[n] (dft) with X[n](fft) + ex::sender auto verify = ex::transfer_just(sch, ret, X_ptr, Y_ptr) + | ex::bulk(N, [](int k, auto &ret, auto X_ptr, auto Y_ptr){ + if (!complex_compare(X_ptr[k], Y_ptr[k])) { - std::cout << "y[" << i << "] = " << X[i] << " != WNk*x[" << i << "] = " << Y[i] << std::endl; + //std::cout << "y[" << i << "] = " << X[i] << " != x[" << i << "] = " << Y[i] << std::endl; ret = false; } + }) + | then([](auto ret, auto &&...) 
+ { + return ret; }); // let the pipeline run - ex::sync_wait(test); + auto [re] = ex::sync_wait(verify).value(); - // delete the memory - delete[] M; - delete[] Y; - - return ret; + return re; } private: // y[n] std::vector y; }; -using sig_t = signal; - -// -// serial fft function -// -void fft_serial(data_t *x, int lN, const int N) -{ - int stride = N/lN; - - if (lN == 2) - { - auto x_0 = x[0] + x[1]* WNk(N, 0); - x[1] = x[0] - x[1]* WNk(N, 0); - x[0] = x_0; - return; - } - - // vectors for even and odd index elements - std::vector e(lN/2); - std::vector o(lN/2); - - // copy data into vectors - for (auto k = 0; k < lN/2; k++) - { - e[k] = x[2*k]; - o[k] = x[2*k+1]; - } - - // compute N/2 pt FFT on even - fft_serial(e.data(), lN/2, N); - - // compute N/2 pt FFT on odd - fft_serial(o.data(), lN/2, N); - - // combine even and odd FFTs - for (int k = 0; k < lN/2; k++) - { - x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); - } - - return; -} \ No newline at end of file +using sig_t = signal; \ No newline at end of file diff --git a/apps/heat-equation/CMakeLists.txt b/apps/heat-equation/CMakeLists.txt index c886dee..f09668f 100644 --- a/apps/heat-equation/CMakeLists.txt +++ b/apps/heat-equation/CMakeLists.txt @@ -5,7 +5,9 @@ file(GLOB CPP_SOURCES "*.cpp") foreach(source_file ${CPP_SOURCES}) if(NOT STDPAR STREQUAL "gpu") if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" - MATCHES ".*gpu.*stdpar.*") + MATCHES ".*gpu.*stdpar.*" + OR "${source_file}" + MATCHES ".*cuda.*") message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") continue() endif() diff --git a/apps/heat-equation/heat-equation-stdexec.cpp b/apps/heat-equation/heat-equation-stdexec.cpp index 11b6cc6..84dd968 100644 --- a/apps/heat-equation/heat-equation-stdexec.cpp +++ b/apps/heat-equation/heat-equation-stdexec.cpp @@ -155,12 +155,14 @@ int main(int argc, char* argv[]) { case sch_t::CPU: algorithm(exec::static_thread_pool(nthreads).get_scheduler()); break; +#if defined(GPUSTDPAR) case sch_t::GPU: algorithm(nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: algorithm(nvexec::multi_gpu_stream_context().get_scheduler()); break; +#endif // GPUSTDPAR default: throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 1a5438f..17fc466 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -33,13 +33,18 @@ #include #include #include -#include -#include + +#if defined(GPUSTDPAR) + #include + #include +using namespace nvexec; +#endif //GPUSTDPAR + #include "argparse/argparse.hpp" #include "commons.hpp" namespace ex = stdexec; -using namespace nvexec; + using namespace exec; // data type @@ -73,7 +78,11 @@ struct heat_params_t : public argparse::Args { #endif // HEQ_OMP || HEQ_STDEXEC #if defined(HEQ_STDEXEC) - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (GPUSTDPAR) + ", gpu, multigpu" + #endif //GPUSTDPAR + "]").set_default("cpu"); #endif // HEQ_STDEXEC Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); diff --git a/include/commons.hpp b/include/commons.hpp index 240a443..584a11b 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -92,7 +92,11 @@ enum class sch_t { CPU, GPU, MULTIGPU }; [[nodiscard]] sch_t 
get_sch_enum(std::string_view str) { static const std::map schmap = { - {"cpu", sch_t::CPU}, {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU}}; + {"cpu", sch_t::CPU}, +#if defined (GPUSTDPAR) + {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU} +#endif // GPUSTDPAR +}; if (schmap.contains(str)) { return schmap.at(str); @@ -100,6 +104,10 @@ enum class sch_t { CPU, GPU, MULTIGPU }; throw std::invalid_argument("FATAL: " + std::string(str) + " is not a stdexec scheduler.\n" - "Available schedulers: cpu (static thread pool), gpu, multigpu.\n" + "Available schedulers: cpu" +#if defined (GPUSTDPAR) + ", gpu, multigpu" +#endif + "\n" "Exiting...\n"); } From 99c4ce5e27469db5106689cab98144fedf84f094 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 20 Oct 2023 13:13:49 -0700 Subject: [PATCH 3/3] updates for review on #44 --- CMakeLists.txt | 7 ++-- apps/1d_stencil/stencil_stdexec.cpp | 10 +++--- apps/fft/fft-serial.cpp | 8 ++--- apps/fft/fft-stdexec.cpp | 38 +++++++++++++------- apps/fft/fft.hpp | 24 ++++++++----- apps/heat-equation/heat-equation-cuda.cpp | 2 +- apps/heat-equation/heat-equation-omp.cpp | 2 +- apps/heat-equation/heat-equation-serial.cpp | 2 +- apps/heat-equation/heat-equation-stdexec.cpp | 6 ++-- apps/heat-equation/heat-equation-stdpar.cpp | 2 +- apps/heat-equation/heat-equation.hpp | 8 ++--- include/commons.hpp | 6 ++-- 12 files changed, 65 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a43330..810dad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,14 +171,13 @@ endif() # need to add appropriate flags for stdexec set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} -pthread" -) + "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP}") # add -cudalib=cublas if -stdpar=gpu if (STDPAR STREQUAL "gpu") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPUSTDPAR") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UGPUSTDPAR") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UUSE_GPU") endif() # ############################################################################## diff --git a/apps/1d_stencil/stencil_stdexec.cpp b/apps/1d_stencil/stencil_stdexec.cpp index e2610a1..4078761 100644 --- a/apps/1d_stencil/stencil_stdexec.cpp +++ b/apps/1d_stencil/stencil_stdexec.cpp @@ -27,7 +27,7 @@ // // This example provides a stdexec implementation for the 1D stencil code. #include -#if defined(GPUSTDPAR) +#if defined(USE_GPU) #include #include #endif @@ -48,9 +48,9 @@ struct args_params_t : public argparse::Args { bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" - #if defined (GPUSTDPAR) + #if defined (USE_GPU) ", gpu, multigpu" - #endif //GPUSTDPAR + #endif //USE_GPU "]").set_default("cpu"); int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); @@ -128,14 +128,14 @@ int benchmark(args_params_t const& args) { case sch_t::CPU: solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt); break; -#if defined(GPUSTDPAR) +#if defined(USE_GPU) case sch_t::GPU: solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt); break; case sch_t::MULTIGPU: solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: std::cerr << "Unknown scheduler type encountered." 
<< std::endl; break; diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 7400023..f970bfa 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -33,7 +33,7 @@ // // serial fft function // -std::vector fft_serial(data_t *x, const int N, bool debug = false) +[[nodiscard]] std::vector fft_serial(const data_t *x, const int N, bool debug = false) { std::vector x_r(N); std::vector id(N); @@ -97,7 +97,7 @@ std::vector fft_serial(data_t *x, const int N, bool debug = false) int main(int argc, char* argv[]) { // parse params - fft_params_t args = argparse::parse(argc, argv); + const fft_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) @@ -139,10 +139,8 @@ int main(int argc, char* argv[]) Timer timer; // fft radix-2 algorithm - auto &&y = fft_serial(x_n.data(), N, args.debug); - // y[n] = fft(x[n]); - sig_t y_n(y); + sig_t y_n(std::move(fft_serial(x_n.data(), N, args.debug))); // stop timer auto elapsed = timer.stop(); diff --git a/apps/fft/fft-stdexec.cpp b/apps/fft/fft-stdexec.cpp index a725f6b..9a3e7fc 100644 --- a/apps/fft/fft-stdexec.cpp +++ b/apps/fft/fft-stdexec.cpp @@ -34,7 +34,7 @@ // // fft algorithm // -std::vector fft(data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) +[[nodiscard]] std::vector fft(const data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) { std::vector x_rev(N); std::vector ind(N); @@ -42,19 +42,20 @@ std::vector fft(data_t *x, scheduler auto sch, const int N, const int ma data_t *x_r = x_rev.data(); uint32_t *id = ind.data(); + // compute shift factor int shift = 32 - ilog2(N); + // twiddle bits for fft ex::sender auto twiddle = ex::transfer_just(sch, x_r, x, id) | ex::bulk(N, [=](int k, auto x_r, auto x, auto id){ id[k] = reverse_bits32(k) >> shift; x_r[k] = x[id[k]]; - }) - | ex::then([](auto &&...){}); - - ex::sync_wait(twiddle); + }); + ex::sync_wait(std::move(twiddle)); // niterations int niters = ilog2(N); + // local merge partition size int lN = 2; @@ -65,41 +66,52 @@ std::vector fft(data_t *x, scheduler auto sch, const int N, const int ma // transfer_just sender ex::sender auto tx = ex::transfer_just(sch, x_r); - for (int k = 0; k < niters; k++, lN*=2) + // iterate until niters - lN*=2 after each iteration + for (int it = 0; it < niters; it++, lN*=2) { - std::cout << (100.0 * k)/niters << "%.." << std::flush; + // print progress + std::cout << (100.0 * it)/niters << "%.." 
<< std::flush; + // debugging timer static Timer dtimer; // number of partitions int nparts = N/lN; int tpp = lN/2; + // display info only if debugging if (debug) { dtimer.start(); std::cout << "lN = " << lN << ", npartitions = " << nparts << ", partition size = " << tpp << std::endl; } + // parallel compute lN-pt FFT ex::sender auto merge = tx | ex::bulk(N/2, [=](auto k, auto y) { // compute indices int e = (k/tpp)*lN + (k % tpp); auto o = e + tpp; auto i = (k % tpp); + + // compute 2-pt DFT auto tmp = y[e] + y[o] * WNk(N, i * nparts); y[o] = y[e] - y[o] * WNk(N, i * nparts); y[e] = tmp; }); + // wait for pipeline ex::sync_wait(std::move(merge)); + // print only if debugging if (debug) std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; } + // print final progress mark std::cout << "100%" << std::endl; + // return x_rev = fft(x_r) return x_rev; } @@ -109,7 +121,7 @@ std::vector fft(data_t *x, scheduler auto sch, const int N, const int ma int main(int argc, char* argv[]) { // parse params - fft_params_t args = argparse::parse(argc, argv); + const fft_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) @@ -147,7 +159,7 @@ int main(int argc, char* argv[]) } // y[n] = fft(x[n]); - std::vector y; + std::vector y(N); // start the timer here Timer timer; @@ -160,14 +172,14 @@ int main(int argc, char* argv[]) case sch_t::CPU: y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug); break; -#if defined(GPUSTDPAR) +#if defined(USE_GPU) case sch_t::GPU: y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024*108, args.debug); break; case sch_t::MULTIGPU: y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4*1024*108, args.debug); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); } @@ -199,14 +211,14 @@ int main(int argc, char* argv[]) case sch_t::CPU: verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler()); break; -#if defined (GPUSTDPAR) +#if defined (USE_GPU) case sch_t::GPU: verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 73280c5..5f4e6ea 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -36,11 +36,11 @@ #include #include -#if defined(GPUSTDPAR) +#if defined(USE_GPU) #include #include using namespace nvexec; -#endif //GPUSTDPAR +#endif //USE_GPU #include #include "argparse/argparse.hpp" @@ -94,9 +94,9 @@ struct fft_params_t : public argparse::Args { #if defined(FFT_STDEXEC) std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" - #if defined (GPUSTDPAR) + #if defined (USE_GPU) ", gpu, multigpu" - #endif //GPUSTDPAR + #endif //USE_GPU "]").set_default("cpu"); #endif // FFT_STDEXEC @@ -171,6 +171,11 @@ class signal y = rhs.y; } + signal(std::vector &&in) + { + y = std::move(in); + } + signal(std::vector &in) { y = std::move(in); @@ -274,14 +279,15 @@ class signal std::cout << "]" << std::endl; } - bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) + [[nodiscard]] bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) { int N = y.size(); bool ret = true; - //int nparts = N/maxN; - //int psize = 
std::min(N, nparts); - //int matsize = psize * psize; + if (X.len() > maxN) + { + std::cout << "Input signal may be too large to compute DFT via y[n] = WNk * x[n]. Segfaults expected.." << std::endl; + } std::vector Y(N); std::vector M(N*N); @@ -330,4 +336,4 @@ class signal std::vector y; }; -using sig_t = signal; \ No newline at end of file +using sig_t = signal; diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index 3ea2988..cfde2c5 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -157,7 +157,7 @@ __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index 6af69b0..696ed87 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -52,7 +52,7 @@ void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-serial.cpp b/apps/heat-equation/heat-equation-serial.cpp index 1ae243b..6e07338 100644 --- a/apps/heat-equation/heat-equation-serial.cpp +++ b/apps/heat-equation/heat-equation-serial.cpp @@ -56,7 +56,7 @@ void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-stdexec.cpp b/apps/heat-equation/heat-equation-stdexec.cpp index 84dd968..7b45db6 100644 --- a/apps/heat-equation/heat-equation-stdexec.cpp +++ b/apps/heat-equation/heat-equation-stdexec.cpp @@ -36,7 +36,7 @@ // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { @@ -155,14 +155,14 @@ int main(int argc, char* argv[]) { case sch_t::CPU: algorithm(exec::static_thread_pool(nthreads).get_scheduler()); break; -#if defined(GPUSTDPAR) +#if defined(USE_GPU) case sch_t::GPU: algorithm(nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: algorithm(nvexec::multi_gpu_stream_context().get_scheduler()); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index b20fb68..ed65ddc 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -35,7 +35,7 @@ // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 17fc466..abbf0e0 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -34,11 
+34,11 @@ #include #include -#if defined(GPUSTDPAR) +#if defined(USE_GPU) #include #include using namespace nvexec; -#endif //GPUSTDPAR +#endif //USE_GPU #include "argparse/argparse.hpp" #include "commons.hpp" @@ -79,9 +79,9 @@ struct heat_params_t : public argparse::Args { #if defined(HEQ_STDEXEC) std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" - #if defined (GPUSTDPAR) + #if defined (USE_GPU) ", gpu, multigpu" - #endif //GPUSTDPAR + #endif //USE_GPU "]").set_default("cpu"); #endif // HEQ_STDEXEC diff --git a/include/commons.hpp b/include/commons.hpp index 584a11b..d917b42 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -93,9 +93,9 @@ enum class sch_t { CPU, GPU, MULTIGPU }; [[nodiscard]] sch_t get_sch_enum(std::string_view str) { static const std::map schmap = { {"cpu", sch_t::CPU}, -#if defined (GPUSTDPAR) +#if defined (USE_GPU) {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU} -#endif // GPUSTDPAR +#endif // USE_GPU }; if (schmap.contains(str)) { @@ -105,7 +105,7 @@ enum class sch_t { CPU, GPU, MULTIGPU }; throw std::invalid_argument("FATAL: " + std::string(str) + " is not a stdexec scheduler.\n" "Available schedulers: cpu" -#if defined (GPUSTDPAR) +#if defined (USE_GPU) ", gpu, multigpu" #endif "\n"