diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c6ee0c..810dad1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,8 +171,14 @@ endif()
 
 # need to add appropriate flags for stdexec
 set(CMAKE_CXX_FLAGS
-    "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} --gcc-toolchain=/opt/cray/pe/gcc/12.2.0/bin/ -pthread"
-)
+    "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP}")
+
+# add -cudalib=cublas if -stdpar=gpu
+if (STDPAR STREQUAL "gpu")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UUSE_GPU")
+endif()
 
 # ##############################################################################
 # Add sub-directories
diff --git a/apps/1d_stencil/CMakeLists.txt b/apps/1d_stencil/CMakeLists.txt
index c5157d2..93f4fc2 100644
--- a/apps/1d_stencil/CMakeLists.txt
+++ b/apps/1d_stencil/CMakeLists.txt
@@ -13,7 +13,6 @@ target_include_directories(
   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
 
-if("${STDPAR}" STREQUAL "gpu")
   # TODO, fix cmake
   add_executable(stencil_stdexec stencil_stdexec.cpp)
   target_link_libraries(stencil_stdexec stdexec)
@@ -22,6 +21,7 @@ if("${STDPAR}" STREQUAL "gpu")
     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
             ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
 
+if("${STDPAR}" STREQUAL "gpu")
   add_executable(stencil_cuda stencil_cuda.cpp)
   target_include_directories(
     stencil_cuda
diff --git a/apps/1d_stencil/stencil_stdexec.cpp b/apps/1d_stencil/stencil_stdexec.cpp
index ace96bc..4078761 100644
--- a/apps/1d_stencil/stencil_stdexec.cpp
+++ b/apps/1d_stencil/stencil_stdexec.cpp
@@ -1,7 +1,7 @@
 /*
  * MIT License
  *
- * Copyright (c) 2023 Weile Wei 
+ * Copyright (c) 2023 Weile Wei
  * Copyright (c) 2023 The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  * required approvals from the U.S. Dept. of Energy).All rights reserved.
@@ -27,8 +27,10 @@
 //
 // This example provides a stdexec implementation for the 1D stencil code.
 #include <exec/static_thread_pool.hpp>
+#if defined(USE_GPU)
 #include <nvexec/multi_gpu_context.cuh>
 #include <nvexec/stream_context.cuh>
+#endif
 #include <stdexec/execution.hpp>
 
 #include "argparse/argparse.hpp"
@@ -45,7 +47,12 @@ struct args_params_t : public argparse::Args {
     bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false);
     bool& help = flag("h, help", "print help");
     bool& time = kwarg("t, time", "print time").set_default(true);
-    std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu");
+    std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu"
+  #if defined (USE_GPU)
+                            ", gpu, multigpu"
+  #endif //USE_GPU
+                            "]").set_default("cpu");
+
     int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
 };
 
@@ -121,12 +128,14 @@ int benchmark(args_params_t const& args) {
             case sch_t::CPU:
                 solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt);
                 break;
+#if defined(USE_GPU)
             case sch_t::GPU:
                 solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt);
                 break;
             case sch_t::MULTIGPU:
                 solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt);
                 break;
+#endif // USE_GPU
             default:
                 std::cerr << "Unknown scheduler type encountered." << std::endl;
                 break;
diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp
index 1550094..1377745 100644
--- a/apps/comm-study/comm-study-no-senders.cpp
+++ b/apps/comm-study/comm-study-no-senders.cpp
@@ -74,7 +74,7 @@ auto work(P& A, P& B, P& Y, int N) {
 
   // get sum(Y) - one last memcpy (not USM) D2H
   sum +=
-      std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus<T>(), [](T &val){return val * val;});
+      std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus<T>());
 
   return sum / N;
 }
diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp
deleted file mode 100644
index 7629ce0..0000000
--- a/apps/comm-study/comm-study.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2023 The Regents of the University of California,
- * through Lawrence Berkeley National Laboratory (subject to receipt of any
- * required approvals from the U.S. Dept. of Energy).  All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "commons.hpp"
-#include "exec/static_thread_pool.hpp"
-
-using namespace std;
-using namespace stdexec;
-using stdexec::sync_wait;
-
-using T = double;
-using time_point_t = std::chrono::system_clock::time_point;
-
-// must take in the pointers/vectors by reference
-template <typename P>
-auto work(P& A, P& B, P& Y, int N) {
-  T sum = 0.0;
-
-  // init A and B separately - will it cause an H2D copy?
-  sender auto s1 = then(just(),
-                        [&] {
-                          std::for_each(std::execution::par_unseq, &A[0], &A[N],
-                                        [&](T& ai) { ai = cos(M_PI / 4); });
-                        })
-                   // trigger a D2H here
-                   | then([&] {
-                       for (int i = 0; i < N / 3; i++) {
-                         // read only or read-write operations
-                         sum += A[i] / N;
-
-                         // this line if commented should not result in an H2D
-                         // after this but it does.
-                         // A[i] = sin(M_PI/4);
-                       }
-                       std::cout << std::endl;
-                     });
-
-  // will it cause an H2D here?
-  sender auto s2 = then(just(), [&] {
-    std::for_each(std::execution::par_unseq, &B[0], &B[N],
-                  [&](T& bi) { bi = sin(M_PI / 6); });
-  });
-
-  // will s1 and s2 execute in parallel or not?
-  sync_wait(when_all(std::move(s1), std::move(s2)));
-
-  // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B)
-  sender auto s3 =
-      then(just(),
-           [&] {
-             std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0],
-                            &A[0], [&](T& ai, T& bi) { return ai + bi; });
-             std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0],
-                            &Y[0], [&](T& ai, T& bi) {
-                              return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi);
-                            });
-           })
-      // should trigger a D2H copy of N/3 elements
-      | then([&] {
-          for (int i = 0; i < N / 3; i++)
-            sum += Y[i] / N;
-
-          std::cout << std::endl;
-        })
-      // get sum(Y) - wonder if there is another H2D as we only read it in the
-      // last step
-      | then([&] {
-          return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0,
-                             std::plus<T>());
-        });
-
-  auto [val] = sync_wait(s3).value();
-
-  return sum += val;
-}
-
-int main(int argc, char* argv[]) {
-  constexpr int N = 1e9;
-  time_point_t mark = std::chrono::system_clock::now();
-  auto es =
-      std::chrono::duration<double>(std::chrono::system_clock::now() - mark)
-          .count();
-  T sum = 0.0;
-
-#if 1  // 0 if only arrays
-  std::vector<T> A(N);
-  std::vector<T> B(N);
-  std::vector<T> Y(N);
-
-  mark = std::chrono::system_clock::now();
-  sum = work(A, B, Y, N);
-  es = std::chrono::duration<double>(std::chrono::system_clock::now() - mark)
-           .count();
-  std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl;
-
-  std::cout << fixed << "sum: " << sum << "\n";
-#endif
-
-#if 1  // 0 if only vectors
-
-  // allocate memory - can we just allocate it on device only?
-  T* a = new T[N];
-  T* b = new T[N];
-  T* y = new T[N];
-
-  sum = 0;
-  mark = std::chrono::system_clock::now();
-  sum = work(a, b, y, N);
-  es = std::chrono::duration<double>(std::chrono::system_clock::now() - mark)
-           .count();
-  std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl
-            << std::endl;
-
-  // do not use scientific notation
-  std::cout << fixed << "sum: " << sum << "\n";
-#endif
-
-  return 0;
-}
\ No newline at end of file
diff --git a/apps/fft/CMakeLists.txt b/apps/fft/CMakeLists.txt
index 25b80ce..10e89c1 100644
--- a/apps/fft/CMakeLists.txt
+++ b/apps/fft/CMakeLists.txt
@@ -2,6 +2,11 @@ project(fft LANGUAGES CXX)
 
 file(GLOB CPP_SOURCES "*.cpp")
 
+# add -cudalib=cublas if -stdpar=gpu
+if (STDPAR STREQUAL "gpu")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -cudalib=cublas")
+endif()
+
 foreach(source_file ${CPP_SOURCES})
   if(NOT STDPAR STREQUAL "gpu")
     if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}"
@@ -18,16 +23,19 @@ foreach(source_file ${CPP_SOURCES})
   add_executable(${exec_name} ${_EXCLUDE} ${source_file})
 
   # add dependency on argparse
-  add_dependencies(${exec_name} argparse magic_enum)
+  add_dependencies(${exec_name} argparse)
 
   set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX
                                                         LINKER_LANGUAGE CXX)
   target_include_directories(
     ${exec_name}
     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
-            ${ARGPARSE_INCLUDE_DIR} ${MAGICENUM_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
+            ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
+
+  # uncomment only if using nvc++/23.1 - no need if nvc++/23.7
+  # target_link_directories(${exec_name} PRIVATE /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64)
 
-  target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec)
+  target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec blas)
 
   set_target_properties(
     ${exec_name}
diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp
index 38082e4..f970bfa 100644
--- a/apps/fft/fft-serial.cpp
+++ b/apps/fft/fft-serial.cpp
@@ -30,13 +30,74 @@
 
 #include "fft.hpp"
 
+//
+// serial fft function
+//
+[[nodiscard]] std::vector<data_t> fft_serial(const data_t *x, const int N, bool debug = false)
+{
+    std::vector<data_t> x_r(N);
+    std::vector<uint32_t> id(N);
+
+    // bit shift
+    int shift = 32 - ilog2(N);
+
+    // twiddle data in x[n]
+    for (int k = 0; k < N; k++)
+    {
+        id[k] = reverse_bits32(k) >> shift;
+        x_r[k] = x[id[k]];
+    }
+
+    // niterations
+    int niters = ilog2(N);
+    // local merge partition size
+    int lN = 2;
+
+    // set cout precision
+    std::cout << std::fixed << std::setprecision(1);
+
+    std::cout << "FFT progress: ";
+
+    for (int k = 0; k < niters; k++, lN*=2)
+    {
+        std::cout << (100.0 * k)/niters << "%.." << std::flush;
+
+        static Timer dtimer;
+
+        // number of partitions
+        int nparts = N/lN;
+        int tpp = lN/2;
+
+        if (debug)
+            dtimer.start();
+
+        // merge
+        for (int k = 0; k < N/2; k++)
+        {
+            // compute indices
+            int  e   = (k/tpp)*lN + (k % tpp);
+            auto o   = e + tpp;
+            auto i   = (k % tpp);
+            auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts);
+            x_r[o]     = x_r[e] - x_r[o] * WNk(N, i * nparts);
+            x_r[e]     = tmp;
+        }
+
+        if (debug)
+        std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl;
+    }
+
+    std::cout << "100%" << std::endl;
+    return x_r;
+}
+
 //
 // simulation
 //
 int main(int argc, char* argv[])
 {
     // parse params
-    fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
+    const fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
 
     // see if help wanted
     if (args.help)
@@ -64,9 +125,6 @@ int main(int argc, char* argv[])
         x_n.resize(N);
     }
 
-    // y[n] = fft(x[n]);
-    sig_t y_n(x_n);
-
     if (print_sig)
     {
         std::cout << std::endl << "x[n] = ";
@@ -81,7 +139,8 @@ int main(int argc, char* argv[])
     Timer timer;
 
     // fft radix-2 algorithm
-    fft_serial(y_n.data(), N, N);
+    // y[n] = fft(x[n]);
+    sig_t y_n(std::move(fft_serial(x_n.data(), N, args.debug)));
 
     // stop timer
     auto elapsed = timer.stop();
@@ -101,7 +160,7 @@ int main(int argc, char* argv[])
     // validate the recursively computed fft
     if (validate)
     {
-        if (x_n.isFFT(y_n))
+        if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler()))
             std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl;
         else
             std::cout << "FAILED: y[n] != fft(x[n])" << std::endl;
diff --git a/apps/fft/fft-stdexec-multicore.cpp b/apps/fft/fft-stdexec-multicore.cpp
deleted file mode 100644
index 045bb6a..0000000
--- a/apps/fft/fft-stdexec-multicore.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * MIT License
- *
- * Copyright (c) 2023 The Regents of the University of California,
- * through Lawrence Berkeley National Laboratory (subject to receipt of any
- * required approvals from the U.S. Dept. of Energy).All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/*
- * commons for the fft codes
- */
-
-#include "fft.hpp"
-
-using any_void_sender =
-      any_sender_of<stdexec::set_value_t(), stdexec::set_stopped_t(),
-                    stdexec::set_error_t(std::exception_ptr)>;
-
-//
-// recursive multicore fft
-//
-any_void_sender fft_multicore(sender auto &&snd, data_t *x, int lN, const int N, int max_threads)
-{
-    // current merge stride
-    int stride = N/lN;
-
-    // to check parallelism
-    //std::cout << "lN = " << lN << ", from tid: " << std::this_thread::get_id() << std::endl;
-
-    // if parallelism > max threads => serial
-    if (stride >= max_threads)
-    {
-        // TODO: can this be improved? Putting it in ex::then doesn't sync
-        fft_serial(x, lN, N);
-        return just();
-    }
-
-    // base case
-    if (lN == 2)
-    {
-        // TODO: can this be improved? Putting it in ex::then doesn't sync
-        auto x_0 = x[0] + x[1]* WNk(N, 0);
-        x[1] = x[0] - x[1]* WNk(N, 0);
-        x[0] = x_0;
-
-        return just();
-    }
-
-    // vectors for even and odd index elements
-    std::vector<data_t> e(lN/2);
-    std::vector<data_t> o(lN/2);
-
-    // array to use in bulk
-    std::array<data_t *, 2> dat{e.data(), o.data()};
-
-    // local thread pool and scheduler
-    exec::static_thread_pool pool_loc{std::min(lN/2, max_threads)};
-    ex::sender auto snd_loc = schedule(pool_loc.get_scheduler());
-
-    // copy even and odd indexes to vectors and split sender
-    ex::sender auto merge =
-        ex::bulk(snd, lN/2, [&](int k){
-            // copy data into vectors
-            e[k] = x[2*k];
-            o[k] = x[2*k+1];
-        })
-        | ex::bulk(2, [=,&dat](int k){
-            // NVC++ 23.1: passing `snd` here results in (nvc++-Fatal-/path/to/tools/cpp1 TERMINATED by signal 11)
-            // NVC++ 23.7 goes in forever loop
-
-            // compute N/2 pt FFT on even and odd in bulk
-            fft_multicore(snd_loc, dat[k], lN/2, N, max_threads);
-        })
-        | ex::bulk(lN/2, [&](int k){
-            // combine even and odd FFTs
-            x[k] = e[k] + o[k] * WNk(N, k * stride);
-            x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride);
-        });
-
-    // wait to complete
-    ex::sync_wait(std::move(merge));
-
-    // return void sender
-    return just();
-}
-
-//
-// simulation
-//
-int main(int argc, char* argv[])
-{
-    // parse params
-    fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
-
-    // see if help wanted
-    if (args.help)
-    {
-        args.print();  // prints all variables
-        return 0;
-    }
-
-    // simulation variables
-    int N = args.N;
-    sig_type_t sig_type = sig_type_t::box;
-    int max_threads = args.max_threads;
-    //int freq = args.freq;
-    bool print_sig = args.print_sig;
-    bool print_time = args.print_time;
-    bool validate = args.validate;
-
-    // x[n] signal
-    sig_t x_n(N, sig_type);
-
-    if (!isPowOf2(N))
-    {
-        N = ceilPowOf2(N);
-        std::cout << "INFO: N is not a power of 2. Padding zeros => N = " << N << std::endl;
-
-        x_n.resize(N);
-    }
-
-    // y[n] = fft(x[n]);
-    sig_t y_n(x_n);
-
-    if (print_sig)
-    {
-        std::cout << std::endl << "x[n] = ";
-        x_n.printSignal();
-        std::cout << std::endl;
-    }
-
-    // niterations
-    int niters = ilog2(N);
-
-    // thread pool and scheduler
-    exec::static_thread_pool pool{max_threads};
-    scheduler auto sched = pool.get_scheduler();
-
-    // start the timer here
-    Timer timer;
-
-    // fft radix-2 algorithm
-    fft_multicore(schedule(sched), y_n.data(), N, N, max_threads);
-
-    // stop timer
-    auto elapsed = timer.stop();
-
-    // print the fft(x)
-    if (print_sig)
-    {
-        std::cout << "X(k) = ";
-        y_n.printSignal();
-        std::cout << std::endl;
-    }
-
-    // print the computation time
-    if (print_time)
-        std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl;
-
-    // validate the recursively computed fft
-    if (validate)
-    {
-        if (x_n.isFFT(y_n))
-            std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl;
-        else
-            std::cout << "FAILED: y[n] != fft(x[n])" << std::endl;
-    }
-
-    return 0;
-}
diff --git a/apps/fft/fft-stdexec.cpp b/apps/fft/fft-stdexec.cpp
new file mode 100644
index 0000000..9a3e7fc
--- /dev/null
+++ b/apps/fft/fft-stdexec.cpp
@@ -0,0 +1,233 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2023 The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of any
+ * required approvals from the U.S. Dept. of Energy).All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * commons for the fft codes
+ */
+
+#define FFT_STDEXEC
+#include "fft.hpp"
+
+//
+// fft algorithm
+//
+[[nodiscard]] std::vector<data_t> fft(const data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false)
+{
+    std::vector<data_t> x_rev(N);
+    std::vector<uint32_t> ind(N);
+
+    data_t *x_r = x_rev.data();
+    uint32_t *id = ind.data();
+
+    // compute shift factor
+    int shift = 32 - ilog2(N);
+
+    // twiddle bits for fft
+    ex::sender auto twiddle = ex::transfer_just(sch, x_r, x, id)
+        | ex::bulk(N, [=](int k, auto x_r, auto x, auto id){
+            id[k] = reverse_bits32(k) >> shift;
+            x_r[k] = x[id[k]];
+        });
+    ex::sync_wait(std::move(twiddle));
+
+    // niterations
+    int niters = ilog2(N);
+
+    // local merge partition size
+    int lN = 2;
+
+    // set cout precision
+    std::cout << std::fixed << std::setprecision(1);
+    std::cout << "FFT progress: ";
+
+    // transfer_just sender
+    ex::sender auto tx = ex::transfer_just(sch, x_r);
+
+    // iterate until niters - lN*=2 after each iteration
+    for (int it = 0; it < niters; it++, lN*=2)
+    {
+        // print progress
+        std::cout << (100.0 * it)/niters << "%.." << std::flush;
+
+        // debugging timer
+        static Timer dtimer;
+
+        // number of partitions
+        int nparts = N/lN;
+        int tpp = lN/2;
+
+        // display info only if debugging
+        if (debug)
+        {
+            dtimer.start();
+            std::cout << "lN = " << lN << ", npartitions = " << nparts << ", partition size = " << tpp << std::endl;
+        }
+
+        // parallel compute lN-pt FFT
+        ex::sender auto merge = tx | ex::bulk(N/2, [=](auto k, auto y)
+        {
+            // compute indices
+            int  e   = (k/tpp)*lN + (k % tpp);
+            auto o   = e + tpp;
+            auto i   = (k % tpp);
+
+            // compute 2-pt DFT
+            auto tmp = y[e] + y[o] * WNk(N, i * nparts);
+            y[o]     = y[e] - y[o] * WNk(N, i * nparts);
+            y[e]     = tmp;
+        });
+
+        // wait for pipeline
+        ex::sync_wait(std::move(merge));
+
+        // print only if debugging
+        if (debug)
+            std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl;
+    }
+
+    // print final progress mark
+    std::cout << "100%" << std::endl;
+
+    // return x_rev = fft(x_r)
+    return x_rev;
+}
+
+//
+// simulation
+//
+int main(int argc, char* argv[])
+{
+    // parse params
+    const fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
+
+    // see if help wanted
+    if (args.help)
+    {
+        args.print();  // prints all variables
+        return 0;
+    }
+
+    // simulation variables
+    int N = args.N;
+    sig_type_t sig_type = sig_type_t::box;
+    int max_threads = args.max_threads;
+    //int freq = args.freq;
+    bool print_sig = args.print_sig;
+    bool print_time = args.print_time;
+    bool validate = args.validate;
+    std::string sched = args.sch;
+
+    // x[n] signal
+    sig_t x_n(N, sig_type);
+
+    if (!isPowOf2(N))
+    {
+        N = ceilPowOf2(N);
+        std::cout << "INFO: N is not a power of 2. Padding zeros => N = " << N << std::endl;
+
+        x_n.resize(N);
+    }
+
+    if (print_sig)
+    {
+        std::cout << std::endl << "x[n] = ";
+        x_n.printSignal();
+        std::cout << std::endl;
+    }
+
+    // y[n] = fft(x[n]);
+    std::vector<data_t> y(N);
+
+    // start the timer here
+    Timer timer;
+
+    // initialize stdexec scheduler
+    sch_t scheduler = get_sch_enum(sched);
+
+    // launch with appropriate stdexec scheduler
+    switch (scheduler) {
+        case sch_t::CPU:
+            y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug);
+            break;
+#if defined(USE_GPU)
+        case sch_t::GPU:
+            y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024*108, args.debug);
+            break;
+        case sch_t::MULTIGPU:
+            y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4*1024*108, args.debug);
+            break;
+#endif // USE_GPU
+        default:
+            throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers");
+  }
+
+    // y[n] = fft(x[n])
+    sig_t y_n(y);
+
+    // stop timer
+    auto elapsed = timer.stop();
+
+    // print the fft(x)
+    if (print_sig)
+    {
+        std::cout << "X(k) = ";
+        y_n.printSignal();
+        std::cout << std::endl;
+    }
+
+    // print the computation time
+    if (print_time)
+        std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl;
+
+    // validate the recursively computed fft
+    if (validate)
+    {
+        bool verify = true;
+            // launch with appropriate stdexec scheduler
+        switch (scheduler) {
+            case sch_t::CPU:
+                verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler());
+                break;
+#if defined (USE_GPU)
+            case sch_t::GPU:
+                verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler());
+                break;
+            case sch_t::MULTIGPU:
+                verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler());
+                break;
+#endif // USE_GPU
+            default:
+                throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers");
+        }
+
+        if (verify)
+            std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl;
+        else
+            std::cout << "FAILED: y[n] != fft(x[n])" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp
index 1700a30..5f4e6ea 100644
--- a/apps/fft/fft.hpp
+++ b/apps/fft/fft.hpp
@@ -34,9 +34,17 @@
 
 #include <experimental/mdspan>
 #include <stdexec/execution.hpp>
-#include <exec/any_sender_of.hpp>
-#include "exec/static_thread_pool.hpp"
+#include <exec/static_thread_pool.hpp>
+
+#if defined(USE_GPU)
+  #include <nvexec/stream_context.cuh>
+  #include <nvexec/multi_gpu_context.cuh>
+using namespace nvexec;
+#endif //USE_GPU
+
+#include <experimental/linalg>
 #include "argparse/argparse.hpp"
+
 #include "commons.hpp"
 
 using namespace std;
@@ -46,10 +54,6 @@ using stdexec::sync_wait;
 
 namespace ex = stdexec;
 
-template <class... Ts>
-using any_sender_of = typename exec::any_receiver_ref<
-    stdexec::completion_signatures<Ts...>>::template any_sender<>;
-
 // 2D view
 using view_2d = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
 
@@ -61,7 +65,7 @@ using data_t = std::complex<Real_t>;
 enum sig_type { square, sinusoid, sawtooth, triangle, sinc, box };
 using sig_type_t = sig_type;
 
-#if defined (__NVCOMPILER)
+// map for signals
 std::map<std::string, sig_type_t> sigmap{{"square",sig_type_t::square}, {"sinusoid", sig_type_t::sinusoid}, {"triangle", sig_type_t::sawtooth},
                            {"triangle", sig_type_t::triangle}, {"sinc", sig_type_t::sinc}, {"box", sig_type_t::box}};
 
@@ -78,31 +82,28 @@ sig_type_t getSignal(std::string &sig)
     }
 }
 
-#else
-
-// if GCC available then just return yourself
-sig_type_t getSignal(sig_type_t &sig) { return sig; }
-
-#endif // _NVCOMPILER
-
 // input arguments
 struct fft_params_t : public argparse::Args {
-
-  // NVC++ is not supported by magic_enum
-#if !defined (__NVCOMPILER)
-  sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(box);
-#else
+  // NVC++ is not supported by magic_enum so using strings
   std::string& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default("box");
-#endif // !defined (__NVCOMPILER)
 
   int& freq = kwarg("f,freq", "Signal frequency").set_default(1024);
   int& N = kwarg("N", "N-point FFT").set_default(1024);
   bool& print_sig = flag("p,print", "print x[n] and X(k)");
   int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
 
+#if defined(FFT_STDEXEC)
+  std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu"
+  #if defined (USE_GPU)
+                          ", gpu, multigpu"
+  #endif //USE_GPU
+                          "]").set_default("cpu");
+#endif  // FFT_STDEXEC
+
   bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]");
   bool& help = flag("h, help", "print help");
   bool& print_time = flag("t,time", "print fft time");
+  bool& debug = flag("d,debug", "print internal timers and launch configs");
 };
 
 inline bool isPowOf2(long long int x) {
@@ -137,9 +138,16 @@ inline int ilog2(uint32_t x)
 bool complex_compare(data_t a, data_t b, double error = 0.0101)
 {
   auto r = (fabs(a.real() - b.real()) < error)? true: false;
-  auto i = (fabs(a.imag() - b.imag()) < error)? true: false;
+  return r && (fabs(a.imag() - b.imag()) < error)? true: false;
+}
 
-  return (r && i);
+uint32_t reverse_bits32(uint32_t x)
+{
+    x = ((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1);
+    x = ((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2);
+    x = ((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4);
+    x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8);
+    return (x >> 16) | (x << 16);
 }
 
 class signal
@@ -163,6 +171,11 @@ class signal
     y = rhs.y;
   }
 
+  signal(std::vector<data_t> &&in)
+  {
+    y = std::move(in);
+  }
+
   signal(std::vector<data_t> &in)
   {
     y = std::move(in);
@@ -266,47 +279,57 @@ class signal
     std::cout << "]" << std::endl;
   }
 
-  bool isFFT(signal &X, int threads = std::thread::hardware_concurrency())
+  [[nodiscard]] bool isFFT(signal &X, scheduler auto sch, int maxN = 20000)
   {
     int N = y.size();
     bool ret = true;
 
-    data_t *Y = new data_t[N];
-    data_t * M  = new data_t[N*N];
-    auto A = std::mdspan<data_t, view_2d, std::layout_right>(M, N, N);
+    if (X.len() > maxN)
+    {
+      std::cout << "Input signal may be too large to compute DFT via y[n] = WNk * x[n]. Segfaults expected.." << std::endl;
+    }
 
-    // scheduler from a thread pool
-    exec::static_thread_pool ctx{std::min(threads, A.extent(0))};
-    scheduler auto sch = ctx.get_scheduler();
+    std::vector<data_t> Y(N);
+    std::vector<data_t> M(N*N);
 
-    ex::sender auto test = ex::bulk(schedule(sch), A.extent(0), [&](int i){
-      for (auto j = 0; j < A.extent(1); j++){
-        A(i, j) = WNk(N, i*j);
-      }
-    })
-    // Compute fft
-    | ex::bulk(A.extent(0), [&](int i){
-      for (auto j = 0; j < A.extent(1); j++){
-        Y[i]+= A(i,j) * y[j];
-      }
-    })
-    // compare the computed fft with input
-    | ex::bulk(N, [&](int i){
-      if (!complex_compare(X[i], Y[i]))
+    auto A   = std::mdspan<data_t, view_2d, std::layout_right>(M.data(), N, N);
+    auto mdy = std::mdspan<data_t, view_2d, std::layout_right>(y.data(), N, 1);
+    auto mdY = std::mdspan<data_t, view_2d, std::layout_right>(Y.data(), N, 1);
+
+    data_t *F = M.data();
+    data_t *X_ptr = X.data();
+    data_t *Y_ptr = Y.data();
+
+    ex::sender auto init = ex::transfer_just(sch, F) | ex::bulk(N*N, [=](int k, auto F){
+      int i = k / N;
+      int j = k % N;
+      F[k] = WNk(N, i*j);
+    });
+
+    // initialize
+    ex::sync_wait(init);
+
+    // compute Y[n] = dft(x[n]) = WNk * x[n]
+    stdex::linalg::matrix_product(std::execution::par, A, mdy, mdY);
+
+    // compare the computed Y[n] (dft) with X[n](fft)
+    ex::sender auto verify = ex::transfer_just(sch, ret, X_ptr, Y_ptr)
+    | ex::bulk(N, [](int k, auto &ret, auto X_ptr, auto Y_ptr){
+      if (!complex_compare(X_ptr[k], Y_ptr[k]))
       {
-        std::cout << "y[" << i << "] = " << X[i] << " != WNk*x[" << i << "] = " << Y[i] << std::endl;
+        //std::cout << "y[" << i << "] = " << X[i] << " != x[" << i << "] = " << Y[i] << std::endl;
         ret = false;
       }
+    })
+    | then([](auto ret, auto &&...)
+    {
+      return ret;
     });
 
     // let the pipeline run
-    ex::sync_wait(test);
-
-    // delete the memory
-    delete[] M;
-    delete[] Y;
+    auto [re] = ex::sync_wait(verify).value();
 
-    return ret;
+    return re;
   }
 private:
   // y[n]
@@ -314,45 +337,3 @@ class signal
 };
 
 using sig_t = signal;
-
-//
-// serial fft function
-//
-void fft_serial(data_t *x, int lN, const int N)
-{
-    int stride = N/lN;
-
-    if (lN == 2)
-    {
-        auto x_0 = x[0] + x[1]* WNk(N, 0);
-        x[1] = x[0] - x[1]* WNk(N, 0);
-        x[0] = x_0;
-        return;
-    }
-
-    // vectors for even and odd index elements
-    std::vector<data_t> e(lN/2);
-    std::vector<data_t> o(lN/2);
-
-    // copy data into vectors
-    for (auto k = 0; k < lN/2; k++)
-    {
-        e[k] = x[2*k];
-        o[k] = x[2*k+1];
-    }
-
-    // compute N/2 pt FFT on even
-    fft_serial(e.data(), lN/2, N);
-
-    // compute N/2 pt FFT on odd
-    fft_serial(o.data(), lN/2, N);
-
-    // combine even and odd FFTs
-    for (int k = 0; k < lN/2; k++)
-    {
-        x[k] = e[k] + o[k] * WNk(N, k * stride);
-        x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride);
-    }
-
-    return;
-}
\ No newline at end of file
diff --git a/apps/heat-equation/CMakeLists.txt b/apps/heat-equation/CMakeLists.txt
index c886dee..f09668f 100644
--- a/apps/heat-equation/CMakeLists.txt
+++ b/apps/heat-equation/CMakeLists.txt
@@ -5,7 +5,9 @@ file(GLOB CPP_SOURCES "*.cpp")
 foreach(source_file ${CPP_SOURCES})
   if(NOT STDPAR STREQUAL "gpu")
     if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}"
-                                                    MATCHES ".*gpu.*stdpar.*")
+                                                    MATCHES ".*gpu.*stdpar.*"
+                                                  OR "${source_file}"
+                                                  MATCHES ".*cuda.*")
       message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}")
       continue()
     endif()
diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp
index 3ea2988..cfde2c5 100644
--- a/apps/heat-equation/heat-equation-cuda.cpp
+++ b/apps/heat-equation/heat-equation-cuda.cpp
@@ -157,7 +157,7 @@ __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) {
 //
 int main(int argc, char* argv[]) {
   // parse params
-  heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
+  const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 
   // see if help wanted
   if (args.help) {
diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp
index 6af69b0..696ed87 100644
--- a/apps/heat-equation/heat-equation-omp.cpp
+++ b/apps/heat-equation/heat-equation-omp.cpp
@@ -52,7 +52,7 @@ void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1,
 //
 int main(int argc, char* argv[]) {
   // parse params
-  heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
+  const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 
   // see if help wanted
   if (args.help) {
diff --git a/apps/heat-equation/heat-equation-serial.cpp b/apps/heat-equation/heat-equation-serial.cpp
index 1ae243b..6e07338 100644
--- a/apps/heat-equation/heat-equation-serial.cpp
+++ b/apps/heat-equation/heat-equation-serial.cpp
@@ -56,7 +56,7 @@ void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) {
 //
 int main(int argc, char* argv[]) {
   // parse params
-  heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
+  const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 
   // see if help wanted
   if (args.help) {
diff --git a/apps/heat-equation/heat-equation-stdexec.cpp b/apps/heat-equation/heat-equation-stdexec.cpp
index 11b6cc6..7b45db6 100644
--- a/apps/heat-equation/heat-equation-stdexec.cpp
+++ b/apps/heat-equation/heat-equation-stdexec.cpp
@@ -36,7 +36,7 @@
 //
 int main(int argc, char* argv[]) {
   // parse params
-  heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
+  const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 
   // see if help wanted
   if (args.help) {
@@ -155,12 +155,14 @@ int main(int argc, char* argv[]) {
     case sch_t::CPU:
       algorithm(exec::static_thread_pool(nthreads).get_scheduler());
       break;
+#if defined(USE_GPU)
     case sch_t::GPU:
       algorithm(nvexec::stream_context().get_scheduler());
       break;
     case sch_t::MULTIGPU:
       algorithm(nvexec::multi_gpu_stream_context().get_scheduler());
       break;
+#endif // USE_GPU
     default:
       throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers");
   }
diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp
index b20fb68..ed65ddc 100644
--- a/apps/heat-equation/heat-equation-stdpar.cpp
+++ b/apps/heat-equation/heat-equation-stdpar.cpp
@@ -35,7 +35,7 @@
 //
 int main(int argc, char* argv[]) {
   // parse params
-  heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
+  const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 
   // see if help wanted
   if (args.help) {
diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp
index 1a5438f..abbf0e0 100644
--- a/apps/heat-equation/heat-equation.hpp
+++ b/apps/heat-equation/heat-equation.hpp
@@ -33,13 +33,18 @@
 #include <experimental/mdspan>
 #include <stdexec/execution.hpp>
 #include <exec/static_thread_pool.hpp>
-#include <nvexec/stream_context.cuh>
-#include <nvexec/multi_gpu_context.cuh>
+
+#if defined(USE_GPU)
+  #include <nvexec/stream_context.cuh>
+  #include <nvexec/multi_gpu_context.cuh>
+using namespace nvexec;
+#endif //USE_GPU
+
 #include "argparse/argparse.hpp"
 #include "commons.hpp"
 
 namespace ex = stdexec;
-using namespace nvexec;
+
 using namespace exec;
 
 // data type
@@ -73,7 +78,11 @@ struct heat_params_t : public argparse::Args {
 #endif  // HEQ_OMP || HEQ_STDEXEC
 
 #if defined(HEQ_STDEXEC)
-  std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu");
+  std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu"
+  #if defined (USE_GPU)
+                          ", gpu, multigpu"
+  #endif //USE_GPU
+                          "]").set_default("cpu");
 #endif  // HEQ_STDEXEC
 
   Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f);
diff --git a/include/commons.hpp b/include/commons.hpp
index 240a443..d917b42 100644
--- a/include/commons.hpp
+++ b/include/commons.hpp
@@ -92,7 +92,11 @@ enum class sch_t { CPU, GPU, MULTIGPU };
 
 [[nodiscard]] sch_t get_sch_enum(std::string_view str) {
     static const std::map<std::string_view, sch_t> schmap = {
-        {"cpu", sch_t::CPU}, {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU}};
+        {"cpu", sch_t::CPU},
+#if defined (USE_GPU)
+        {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU}
+#endif // USE_GPU
+};
 
     if (schmap.contains(str)) {
         return schmap.at(str);
@@ -100,6 +104,10 @@ enum class sch_t { CPU, GPU, MULTIGPU };
 
     throw std::invalid_argument("FATAL: " + std::string(str) +
                                 " is not a stdexec scheduler.\n"
-                                "Available schedulers: cpu (static thread pool), gpu, multigpu.\n"
+                                "Available schedulers: cpu"
+#if defined (USE_GPU)
+                                ", gpu, multigpu"
+#endif
+                                "\n"
                                 "Exiting...\n");
 }