From 46f1c4e28b67894321fb044db02e4048e43fe7ed Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 18 Oct 2023 19:50:46 -0700
Subject: [PATCH 1/3] fft gpu - perf analysis remains

---
 ...-stdexec-multicore.cpp => fft-stdexec.cpp} | 156 +++++++++++-------
 apps/fft/fft.hpp | 15 ++
 2 files changed, 107 insertions(+), 64 deletions(-)
 rename apps/fft/{fft-stdexec-multicore.cpp => fft-stdexec.cpp} (50%)

diff --git a/apps/fft/fft-stdexec-multicore.cpp b/apps/fft/fft-stdexec.cpp
similarity index 50%
rename from apps/fft/fft-stdexec-multicore.cpp
rename to apps/fft/fft-stdexec.cpp
index 045bb6a..b93736a 100644
--- a/apps/fft/fft-stdexec-multicore.cpp
+++ b/apps/fft/fft-stdexec.cpp
@@ -28,78 +28,94 @@
  * commons for the fft codes
  */
 
+#define FFT_STDEXEC
 #include "fft.hpp"
 
+using namespace nvexec;
+
 using any_void_sender =
   any_sender_of;
 
 //
-// recursive multicore fft
+// fft algorithm
 //
-any_void_sender fft_multicore(sender auto &&snd, data_t *x, int lN, const int N, int max_threads)
+std::vector fft(scheduler auto sch, data_t *x, const int N, const int max_threads)
 {
-    // current merge stride
-    int stride = N/lN;
-
-    // to check parallelism
-    //std::cout << "lN = " << lN << ", from tid: " << std::this_thread::get_id() << std::endl;
+    std::vector x_rev(N);
+    std::vector ind(N);
 
-    // if parallelism > max threads => serial
-    if (stride >= max_threads)
-    {
-        // TODO: can this be improved? Putting it in ex::then doesn't sync
-        fft_serial(x, lN, N);
-        return just();
-    }
+    data_t *x_r = x_rev.data();
+    uint32_t *id = ind.data();
 
-    // base case
-    if (lN == 2)
-    {
-        // TODO: can this be improved? Putting it in ex::then doesn't sync
-        auto x_0 = x[0] + x[1]* WNk(N, 0);
-        x[1] = x[0] - x[1]* WNk(N, 0);
-        x[0] = x_0;
+    int shift = 32 - ilog2(N);
 
-        return just();
-    }
+    ex::sender auto twiddle = ex::transfer_just(sch, x_r, x, id)
+        | ex::bulk(N, [=](int k, auto x_r, auto x, auto id){
+            id[k] = reverse_bits32(k) >> shift;
+            x_r[k] = x[id[k]];
+        })
+        | ex::then([](auto &&...){});
 
-    // vectors for even and odd index elements
-    std::vector e(lN/2);
-    std::vector o(lN/2);
+    ex::sync_wait(twiddle);
 
-    // array to use in bulk
-    std::array dat{e.data(), o.data()};
+    // niterations
+    int niters = ilog2(N);
+    // local merge partition size
+    int lN = 2;
 
-    // local thread pool and scheduler
-    exec::static_thread_pool pool_loc{std::min(lN/2, max_threads)};
-    ex::sender auto snd_loc = schedule(pool_loc.get_scheduler());
+    // set cout precision
+    std::cout << std::fixed << std::setprecision(1);
 
-    // copy even and odd indexes to vectors and split sender
-    ex::sender auto merge =
-        ex::bulk(snd, lN/2, [&](int k){
-            // copy data into vectors
-            e[k] = x[2*k];
-            o[k] = x[2*k+1];
-        })
-        | ex::bulk(2, [=,&dat](int k){
-            // NVC++ 23.1: passing `snd` here results in (nvc++-Fatal-/path/to/tools/cpp1 TERMINATED by signal 11)
-            // NVC++ 23.7 goes in forever loop
+    // transfer_just sender
+    ex::sender auto tx = ex::transfer_just(sch, x_r);
 
-            // compute N/2 pt FFT on even and odd in bulk
-            fft_multicore(snd_loc, dat[k], lN/2, N, max_threads);
-        })
-        | ex::bulk(lN/2, [&](int k){
+    for (int k = 0; k < niters; k++, lN*=2)
+    {
+        std::cout << "FFT progress: " << (100.0 * k)/niters << "%" << std::endl;
+
+        // number of partitions
+        int stride = N/lN;
+
+        if (lN < max_threads)
+        {
+            //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << stride << ", each thread = " << lN/2 << std::endl;
+            ex::sender auto merge = tx | ex::bulk(stride, [=](auto k, auto y)
+            {
+                // combine even and odd FFTs
+                for (int i = 0; i < lN/2;
i++) + { + auto e = i + k*lN; + auto o = i + k*lN + lN/2; + auto tmp = y[e] + y[o] * WNk(N, i * stride); + y[o] = y[e] - y[o] * WNk(N, i * stride); + y[e] = tmp; + } + }); + + ex::sync_wait(std::move(merge)); + } + else + { + //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << lN/2 << ", x times called = " << stride << std::endl; // combine even and odd FFTs - x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); - }); - - // wait to complete - ex::sync_wait(std::move(merge)); + for (int i = 0; i < stride; i++) + { + ex::sender auto merge = tx | ex::bulk(lN/2, [=](auto k, auto y) + { + auto e = k + i*lN; + auto o = k + i*lN + lN/2; + auto tmp = y[e] + y[o] * WNk(N, k * stride); + y[o] = y[e] - y[o] * WNk(N, k * stride); + y[e] = tmp; + }); + + ex::sync_wait(std::move(merge)); + } + } + } - // return void sender - return just(); + return x_rev; } // @@ -125,6 +141,7 @@ int main(int argc, char* argv[]) bool print_sig = args.print_sig; bool print_time = args.print_time; bool validate = args.validate; + std::string sched = args.sch; // x[n] signal sig_t x_n(N, sig_type); @@ -137,9 +154,6 @@ int main(int argc, char* argv[]) x_n.resize(N); } - // y[n] = fft(x[n]); - sig_t y_n(x_n); - if (print_sig) { std::cout << std::endl << "x[n] = "; @@ -147,18 +161,32 @@ int main(int argc, char* argv[]) std::cout << std::endl; } - // niterations - int niters = ilog2(N); - - // thread pool and scheduler - exec::static_thread_pool pool{max_threads}; - scheduler auto sched = pool.get_scheduler(); + // y[n] = fft(x[n]); + std::vector y; // start the timer here Timer timer; - // fft radix-2 algorithm - fft_multicore(schedule(sched), y_n.data(), N, N, max_threads); + // initialize stdexec scheduler + sch_t scheduler = get_sch_enum(sched); + + // launch with appropriate stdexec scheduler + switch (scheduler) { + case sch_t::CPU: + y = fft(exec::static_thread_pool(max_threads).get_scheduler(), x_n.data(), N, max_threads); + break; + case sch_t::GPU: + y = fft(nvexec::stream_context().get_scheduler(), x_n.data(), N, 1024*108); + break; + case sch_t::MULTIGPU: + y = fft(nvexec::multi_gpu_stream_context().get_scheduler(), x_n.data(), N, 4*1024*108); + break; + default: + throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); + } + + // y[n] = fft(x[n]) + sig_t y_n(y); // stop timer auto elapsed = timer.stop(); diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 1700a30..ae63fdd 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -34,6 +34,8 @@ #include #include +#include +#include #include #include "exec/static_thread_pool.hpp" #include "argparse/argparse.hpp" @@ -100,6 +102,10 @@ struct fft_params_t : public argparse::Args { bool& print_sig = flag("p,print", "print x[n] and X(k)"); int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); +#if defined(FFT_STDEXEC) + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); +#endif // FFT_STDEXEC + bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]"); bool& help = flag("h, help", "print help"); bool& print_time = flag("t,time", "print fft time"); @@ -142,6 +148,15 @@ bool complex_compare(data_t a, data_t b, double error = 0.0101) return (r && i); } +uint32_t reverse_bits32(uint32_t x) +{ + x = ((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1); + x = ((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2); + x = ((x & 0xf0f0f0f0) >> 4) | ((x & 
0x0f0f0f0f) << 4); + x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8); + return (x >> 16) | (x << 16); +} + class signal { public: From af2a10d4d5c1873960354d2536a2affc5953362d Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 19 Oct 2023 20:28:48 -0700 Subject: [PATCH 2/3] optimize fft, cmake cleanup, using linalg in DFT --- CMakeLists.txt | 9 +- apps/1d_stencil/CMakeLists.txt | 2 +- apps/1d_stencil/stencil_stdexec.cpp | 13 +- apps/comm-study/comm-study-no-senders.cpp | 2 +- apps/comm-study/comm-study.cpp | 142 ----------------- apps/fft/CMakeLists.txt | 14 +- apps/fft/fft-serial.cpp | 71 ++++++++- apps/fft/fft-stdexec.cpp | 97 ++++++------ apps/fft/fft.hpp | 154 +++++++------------ apps/heat-equation/CMakeLists.txt | 4 +- apps/heat-equation/heat-equation-stdexec.cpp | 2 + apps/heat-equation/heat-equation.hpp | 17 +- include/commons.hpp | 12 +- 13 files changed, 234 insertions(+), 305 deletions(-) delete mode 100644 apps/comm-study/comm-study.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c6ee0c..1a43330 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,9 +171,16 @@ endif() # need to add appropriate flags for stdexec set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} --gcc-toolchain=/opt/cray/pe/gcc/12.2.0/bin/ -pthread" + "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} -pthread" ) +# add -cudalib=cublas if -stdpar=gpu +if (STDPAR STREQUAL "gpu") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPUSTDPAR") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UGPUSTDPAR") +endif() + # ############################################################################## # Add sub-directories # ############################################################################## diff --git a/apps/1d_stencil/CMakeLists.txt b/apps/1d_stencil/CMakeLists.txt index c5157d2..93f4fc2 100644 --- a/apps/1d_stencil/CMakeLists.txt +++ b/apps/1d_stencil/CMakeLists.txt @@ -13,7 +13,6 @@ target_include_directories( PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) -if("${STDPAR}" STREQUAL "gpu") # TODO, fix cmake add_executable(stencil_stdexec stencil_stdexec.cpp) target_link_libraries(stencil_stdexec stdexec) @@ -22,6 +21,7 @@ if("${STDPAR}" STREQUAL "gpu") PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) +if("${STDPAR}" STREQUAL "gpu") add_executable(stencil_cuda stencil_cuda.cpp) target_include_directories( stencil_cuda diff --git a/apps/1d_stencil/stencil_stdexec.cpp b/apps/1d_stencil/stencil_stdexec.cpp index ace96bc..e2610a1 100644 --- a/apps/1d_stencil/stencil_stdexec.cpp +++ b/apps/1d_stencil/stencil_stdexec.cpp @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 Weile Wei * Copyright (c) 2023 The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of any * required approvals from the U.S. Dept. of Energy).All rights reserved. @@ -27,8 +27,10 @@ // // This example provides a stdexec implementation for the 1D stencil code. 
#include +#if defined(GPUSTDPAR) #include #include +#endif #include #include "argparse/argparse.hpp" @@ -45,7 +47,12 @@ struct args_params_t : public argparse::Args { bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (GPUSTDPAR) + ", gpu, multigpu" + #endif //GPUSTDPAR + "]").set_default("cpu"); + int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); }; @@ -121,12 +128,14 @@ int benchmark(args_params_t const& args) { case sch_t::CPU: solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt); break; +#if defined(GPUSTDPAR) case sch_t::GPU: solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt); break; case sch_t::MULTIGPU: solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt); break; +#endif // GPUSTDPAR default: std::cerr << "Unknown scheduler type encountered." << std::endl; break; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1550094..1377745 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -74,7 +74,7 @@ auto work(P& A, P& B, P& Y, int N) { // get sum(Y) - one last memcpy (not USM) D2H sum += - std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); + std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); return sum / N; } diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp deleted file mode 100644 index 7629ce0..0000000 --- a/apps/comm-study/comm-study.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2023 The Regents of the University of California, - * through Lawrence Berkeley National Laboratory (subject to receipt of any - * required approvals from the U.S. Dept. of Energy). All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "commons.hpp" -#include "exec/static_thread_pool.hpp" - -using namespace std; -using namespace stdexec; -using stdexec::sync_wait; - -using T = double; -using time_point_t = std::chrono::system_clock::time_point; - -// must take in the pointers/vectors by reference -template -auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = then(just(), - [&] { - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); - }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then(just(), [&] { - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); - }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = - then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &A[0], [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &Y[0], [&](T& ai, T& bi) { - return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); - }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { - return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, - std::plus()); - }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; -} - -int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0.0; - -#if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); - - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - - std::cout << fixed << "sum: " << sum << "\n"; -#endif - -#if 1 // 0 if only vectors - - // allocate memory - can we just allocate it on device only? 
- T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; -#endif - - return 0; -} \ No newline at end of file diff --git a/apps/fft/CMakeLists.txt b/apps/fft/CMakeLists.txt index 25b80ce..10e89c1 100644 --- a/apps/fft/CMakeLists.txt +++ b/apps/fft/CMakeLists.txt @@ -2,6 +2,11 @@ project(fft LANGUAGES CXX) file(GLOB CPP_SOURCES "*.cpp") +# add -cudalib=cublas if -stdpar=gpu +if (STDPAR STREQUAL "gpu") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -cudalib=cublas") +endif() + foreach(source_file ${CPP_SOURCES}) if(NOT STDPAR STREQUAL "gpu") if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" @@ -18,16 +23,19 @@ foreach(source_file ${CPP_SOURCES}) add_executable(${exec_name} ${_EXCLUDE} ${source_file}) # add dependency on argparse - add_dependencies(${exec_name} argparse magic_enum) + add_dependencies(${exec_name} argparse) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX LINKER_LANGUAGE CXX) target_include_directories( ${exec_name} PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include - ${ARGPARSE_INCLUDE_DIR} ${MAGICENUM_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + + # uncomment only if using nvc++/23.1 - no need if nvc++/23.7 + # target_link_directories(${exec_name} PRIVATE /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64) - target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec) + target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec blas) set_target_properties( ${exec_name} diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 38082e4..7400023 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -30,6 +30,67 @@ #include "fft.hpp" +// +// serial fft function +// +std::vector fft_serial(data_t *x, const int N, bool debug = false) +{ + std::vector x_r(N); + std::vector id(N); + + // bit shift + int shift = 32 - ilog2(N); + + // twiddle data in x[n] + for (int k = 0; k < N; k++) + { + id[k] = reverse_bits32(k) >> shift; + x_r[k] = x[id[k]]; + } + + // niterations + int niters = ilog2(N); + // local merge partition size + int lN = 2; + + // set cout precision + std::cout << std::fixed << std::setprecision(1); + + std::cout << "FFT progress: "; + + for (int k = 0; k < niters; k++, lN*=2) + { + std::cout << (100.0 * k)/niters << "%.." 
<< std::flush; + + static Timer dtimer; + + // number of partitions + int nparts = N/lN; + int tpp = lN/2; + + if (debug) + dtimer.start(); + + // merge + for (int k = 0; k < N/2; k++) + { + // compute indices + int e = (k/tpp)*lN + (k % tpp); + auto o = e + tpp; + auto i = (k % tpp); + auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts); + x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts); + x_r[e] = tmp; + } + + if (debug) + std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; + } + + std::cout << "100%" << std::endl; + return x_r; +} + // // simulation // @@ -64,9 +125,6 @@ int main(int argc, char* argv[]) x_n.resize(N); } - // y[n] = fft(x[n]); - sig_t y_n(x_n); - if (print_sig) { std::cout << std::endl << "x[n] = "; @@ -81,7 +139,10 @@ int main(int argc, char* argv[]) Timer timer; // fft radix-2 algorithm - fft_serial(y_n.data(), N, N); + auto &&y = fft_serial(x_n.data(), N, args.debug); + + // y[n] = fft(x[n]); + sig_t y_n(y); // stop timer auto elapsed = timer.stop(); @@ -101,7 +162,7 @@ int main(int argc, char* argv[]) // validate the recursively computed fft if (validate) { - if (x_n.isFFT(y_n)) + if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; else std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; diff --git a/apps/fft/fft-stdexec.cpp b/apps/fft/fft-stdexec.cpp index b93736a..a725f6b 100644 --- a/apps/fft/fft-stdexec.cpp +++ b/apps/fft/fft-stdexec.cpp @@ -31,16 +31,10 @@ #define FFT_STDEXEC #include "fft.hpp" -using namespace nvexec; - -using any_void_sender = - any_sender_of; - // // fft algorithm // -std::vector fft(scheduler auto sch, data_t *x, const int N, const int max_threads) +std::vector fft(data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) { std::vector x_rev(N); std::vector ind(N); @@ -66,55 +60,46 @@ std::vector fft(scheduler auto sch, data_t *x, const int N, const int ma // set cout precision std::cout << std::fixed << std::setprecision(1); + std::cout << "FFT progress: "; // transfer_just sender ex::sender auto tx = ex::transfer_just(sch, x_r); for (int k = 0; k < niters; k++, lN*=2) { - std::cout << "FFT progress: " << (100.0 * k)/niters << "%" << std::endl; + std::cout << (100.0 * k)/niters << "%.." 
<< std::flush; + + static Timer dtimer; // number of partitions - int stride = N/lN; + int nparts = N/lN; + int tpp = lN/2; - if (lN < max_threads) + if (debug) { - //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << stride << ", each thread = " << lN/2 << std::endl; - ex::sender auto merge = tx | ex::bulk(stride, [=](auto k, auto y) - { - // combine even and odd FFTs - for (int i = 0; i < lN/2; i++) - { - auto e = i + k*lN; - auto o = i + k*lN + lN/2; - auto tmp = y[e] + y[o] * WNk(N, i * stride); - y[o] = y[e] - y[o] * WNk(N, i * stride); - y[e] = tmp; - } - }); - - ex::sync_wait(std::move(merge)); + dtimer.start(); + std::cout << "lN = " << lN << ", npartitions = " << nparts << ", partition size = " << tpp << std::endl; } - else + + ex::sender auto merge = tx | ex::bulk(N/2, [=](auto k, auto y) { - //std::cout << "lN = " << lN << ", partition size = " << stride << ", bulk = " << lN/2 << ", x times called = " << stride << std::endl; - // combine even and odd FFTs - for (int i = 0; i < stride; i++) - { - ex::sender auto merge = tx | ex::bulk(lN/2, [=](auto k, auto y) - { - auto e = k + i*lN; - auto o = k + i*lN + lN/2; - auto tmp = y[e] + y[o] * WNk(N, k * stride); - y[o] = y[e] - y[o] * WNk(N, k * stride); - y[e] = tmp; - }); - - ex::sync_wait(std::move(merge)); - } - } + // compute indices + int e = (k/tpp)*lN + (k % tpp); + auto o = e + tpp; + auto i = (k % tpp); + auto tmp = y[e] + y[o] * WNk(N, i * nparts); + y[o] = y[e] - y[o] * WNk(N, i * nparts); + y[e] = tmp; + }); + + ex::sync_wait(std::move(merge)); + + if (debug) + std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; } + std::cout << "100%" << std::endl; + return x_rev; } @@ -173,14 +158,16 @@ int main(int argc, char* argv[]) // launch with appropriate stdexec scheduler switch (scheduler) { case sch_t::CPU: - y = fft(exec::static_thread_pool(max_threads).get_scheduler(), x_n.data(), N, max_threads); + y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug); break; +#if defined(GPUSTDPAR) case sch_t::GPU: - y = fft(nvexec::stream_context().get_scheduler(), x_n.data(), N, 1024*108); + y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024*108, args.debug); break; case sch_t::MULTIGPU: - y = fft(nvexec::multi_gpu_stream_context().get_scheduler(), x_n.data(), N, 4*1024*108); + y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4*1024*108, args.debug); break; +#endif // GPUSTDPAR default: throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); } @@ -206,7 +193,25 @@ int main(int argc, char* argv[]) // validate the recursively computed fft if (validate) { - if (x_n.isFFT(y_n)) + bool verify = true; + // launch with appropriate stdexec scheduler + switch (scheduler) { + case sch_t::CPU: + verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler()); + break; +#if defined (GPUSTDPAR) + case sch_t::GPU: + verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); + break; + case sch_t::MULTIGPU: + verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); + break; +#endif // GPUSTDPAR + default: + throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); + } + + if (verify) std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; else std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index ae63fdd..73280c5 100644 --- a/apps/fft/fft.hpp +++ 
b/apps/fft/fft.hpp @@ -34,11 +34,17 @@ #include #include -#include -#include -#include -#include "exec/static_thread_pool.hpp" +#include + +#if defined(GPUSTDPAR) + #include + #include +using namespace nvexec; +#endif //GPUSTDPAR + +#include #include "argparse/argparse.hpp" + #include "commons.hpp" using namespace std; @@ -48,10 +54,6 @@ using stdexec::sync_wait; namespace ex = stdexec; -template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; - // 2D view using view_2d = std::extents; @@ -63,7 +65,7 @@ using data_t = std::complex; enum sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; using sig_type_t = sig_type; -#if defined (__NVCOMPILER) +// map for signals std::map sigmap{{"square",sig_type_t::square}, {"sinusoid", sig_type_t::sinusoid}, {"triangle", sig_type_t::sawtooth}, {"triangle", sig_type_t::triangle}, {"sinc", sig_type_t::sinc}, {"box", sig_type_t::box}}; @@ -80,22 +82,10 @@ sig_type_t getSignal(std::string &sig) } } -#else - -// if GCC available then just return yourself -sig_type_t getSignal(sig_type_t &sig) { return sig; } - -#endif // _NVCOMPILER - // input arguments struct fft_params_t : public argparse::Args { - - // NVC++ is not supported by magic_enum -#if !defined (__NVCOMPILER) - sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(box); -#else + // NVC++ is not supported by magic_enum so using strings std::string& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default("box"); -#endif // !defined (__NVCOMPILER) int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); int& N = kwarg("N", "N-point FFT").set_default(1024); @@ -103,12 +93,17 @@ struct fft_params_t : public argparse::Args { int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); #if defined(FFT_STDEXEC) - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (GPUSTDPAR) + ", gpu, multigpu" + #endif //GPUSTDPAR + "]").set_default("cpu"); #endif // FFT_STDEXEC bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]"); bool& help = flag("h, help", "print help"); bool& print_time = flag("t,time", "print fft time"); + bool& debug = flag("d,debug", "print internal timers and launch configs"); }; inline bool isPowOf2(long long int x) { @@ -143,9 +138,7 @@ inline int ilog2(uint32_t x) bool complex_compare(data_t a, data_t b, double error = 0.0101) { auto r = (fabs(a.real() - b.real()) < error)? true: false; - auto i = (fabs(a.imag() - b.imag()) < error)? true: false; - - return (r && i); + return r && (fabs(a.imag() - b.imag()) < error)? 
true: false; } uint32_t reverse_bits32(uint32_t x) @@ -281,93 +274,60 @@ class signal std::cout << "]" << std::endl; } - bool isFFT(signal &X, int threads = std::thread::hardware_concurrency()) + bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) { int N = y.size(); bool ret = true; - data_t *Y = new data_t[N]; - data_t * M = new data_t[N*N]; - auto A = std::mdspan(M, N, N); + //int nparts = N/maxN; + //int psize = std::min(N, nparts); + //int matsize = psize * psize; - // scheduler from a thread pool - exec::static_thread_pool ctx{std::min(threads, A.extent(0))}; - scheduler auto sch = ctx.get_scheduler(); + std::vector Y(N); + std::vector M(N*N); - ex::sender auto test = ex::bulk(schedule(sch), A.extent(0), [&](int i){ - for (auto j = 0; j < A.extent(1); j++){ - A(i, j) = WNk(N, i*j); - } - }) - // Compute fft - | ex::bulk(A.extent(0), [&](int i){ - for (auto j = 0; j < A.extent(1); j++){ - Y[i]+= A(i,j) * y[j]; - } - }) - // compare the computed fft with input - | ex::bulk(N, [&](int i){ - if (!complex_compare(X[i], Y[i])) + auto A = std::mdspan(M.data(), N, N); + auto mdy = std::mdspan(y.data(), N, 1); + auto mdY = std::mdspan(Y.data(), N, 1); + + data_t *F = M.data(); + data_t *X_ptr = X.data(); + data_t *Y_ptr = Y.data(); + + ex::sender auto init = ex::transfer_just(sch, F) | ex::bulk(N*N, [=](int k, auto F){ + int i = k / N; + int j = k % N; + F[k] = WNk(N, i*j); + }); + + // initialize + ex::sync_wait(init); + + // compute Y[n] = dft(x[n]) = WNk * x[n] + stdex::linalg::matrix_product(std::execution::par, A, mdy, mdY); + + // compare the computed Y[n] (dft) with X[n](fft) + ex::sender auto verify = ex::transfer_just(sch, ret, X_ptr, Y_ptr) + | ex::bulk(N, [](int k, auto &ret, auto X_ptr, auto Y_ptr){ + if (!complex_compare(X_ptr[k], Y_ptr[k])) { - std::cout << "y[" << i << "] = " << X[i] << " != WNk*x[" << i << "] = " << Y[i] << std::endl; + //std::cout << "y[" << i << "] = " << X[i] << " != x[" << i << "] = " << Y[i] << std::endl; ret = false; } + }) + | then([](auto ret, auto &&...) 
+ { + return ret; }); // let the pipeline run - ex::sync_wait(test); + auto [re] = ex::sync_wait(verify).value(); - // delete the memory - delete[] M; - delete[] Y; - - return ret; + return re; } private: // y[n] std::vector y; }; -using sig_t = signal; - -// -// serial fft function -// -void fft_serial(data_t *x, int lN, const int N) -{ - int stride = N/lN; - - if (lN == 2) - { - auto x_0 = x[0] + x[1]* WNk(N, 0); - x[1] = x[0] - x[1]* WNk(N, 0); - x[0] = x_0; - return; - } - - // vectors for even and odd index elements - std::vector e(lN/2); - std::vector o(lN/2); - - // copy data into vectors - for (auto k = 0; k < lN/2; k++) - { - e[k] = x[2*k]; - o[k] = x[2*k+1]; - } - - // compute N/2 pt FFT on even - fft_serial(e.data(), lN/2, N); - - // compute N/2 pt FFT on odd - fft_serial(o.data(), lN/2, N); - - // combine even and odd FFTs - for (int k = 0; k < lN/2; k++) - { - x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); - } - - return; -} \ No newline at end of file +using sig_t = signal; \ No newline at end of file diff --git a/apps/heat-equation/CMakeLists.txt b/apps/heat-equation/CMakeLists.txt index c886dee..f09668f 100644 --- a/apps/heat-equation/CMakeLists.txt +++ b/apps/heat-equation/CMakeLists.txt @@ -5,7 +5,9 @@ file(GLOB CPP_SOURCES "*.cpp") foreach(source_file ${CPP_SOURCES}) if(NOT STDPAR STREQUAL "gpu") if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" - MATCHES ".*gpu.*stdpar.*") + MATCHES ".*gpu.*stdpar.*" + OR "${source_file}" + MATCHES ".*cuda.*") message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") continue() endif() diff --git a/apps/heat-equation/heat-equation-stdexec.cpp b/apps/heat-equation/heat-equation-stdexec.cpp index 11b6cc6..84dd968 100644 --- a/apps/heat-equation/heat-equation-stdexec.cpp +++ b/apps/heat-equation/heat-equation-stdexec.cpp @@ -155,12 +155,14 @@ int main(int argc, char* argv[]) { case sch_t::CPU: algorithm(exec::static_thread_pool(nthreads).get_scheduler()); break; +#if defined(GPUSTDPAR) case sch_t::GPU: algorithm(nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: algorithm(nvexec::multi_gpu_stream_context().get_scheduler()); break; +#endif // GPUSTDPAR default: throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 1a5438f..17fc466 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -33,13 +33,18 @@ #include #include #include -#include -#include + +#if defined(GPUSTDPAR) + #include + #include +using namespace nvexec; +#endif //GPUSTDPAR + #include "argparse/argparse.hpp" #include "commons.hpp" namespace ex = stdexec; -using namespace nvexec; + using namespace exec; // data type @@ -73,7 +78,11 @@ struct heat_params_t : public argparse::Args { #endif // HEQ_OMP || HEQ_STDEXEC #if defined(HEQ_STDEXEC) - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (GPUSTDPAR) + ", gpu, multigpu" + #endif //GPUSTDPAR + "]").set_default("cpu"); #endif // HEQ_STDEXEC Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); diff --git a/include/commons.hpp b/include/commons.hpp index 240a443..584a11b 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -92,7 +92,11 @@ enum class sch_t { CPU, GPU, MULTIGPU }; [[nodiscard]] sch_t 
get_sch_enum(std::string_view str) { static const std::map schmap = { - {"cpu", sch_t::CPU}, {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU}}; + {"cpu", sch_t::CPU}, +#if defined (GPUSTDPAR) + {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU} +#endif // GPUSTDPAR +}; if (schmap.contains(str)) { return schmap.at(str); @@ -100,6 +104,10 @@ enum class sch_t { CPU, GPU, MULTIGPU }; throw std::invalid_argument("FATAL: " + std::string(str) + " is not a stdexec scheduler.\n" - "Available schedulers: cpu (static thread pool), gpu, multigpu.\n" + "Available schedulers: cpu" +#if defined (GPUSTDPAR) + ", gpu, multigpu" +#endif + "\n" "Exiting...\n"); } From 99c4ce5e27469db5106689cab98144fedf84f094 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 20 Oct 2023 13:13:49 -0700 Subject: [PATCH 3/3] updates for review on #44 --- CMakeLists.txt | 7 ++-- apps/1d_stencil/stencil_stdexec.cpp | 10 +++--- apps/fft/fft-serial.cpp | 8 ++--- apps/fft/fft-stdexec.cpp | 38 +++++++++++++------- apps/fft/fft.hpp | 24 ++++++++----- apps/heat-equation/heat-equation-cuda.cpp | 2 +- apps/heat-equation/heat-equation-omp.cpp | 2 +- apps/heat-equation/heat-equation-serial.cpp | 2 +- apps/heat-equation/heat-equation-stdexec.cpp | 6 ++-- apps/heat-equation/heat-equation-stdpar.cpp | 2 +- apps/heat-equation/heat-equation.hpp | 8 ++--- include/commons.hpp | 6 ++-- 12 files changed, 65 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a43330..810dad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,14 +171,13 @@ endif() # need to add appropriate flags for stdexec set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} -pthread" -) + "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP}") # add -cudalib=cublas if -stdpar=gpu if (STDPAR STREQUAL "gpu") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGPUSTDPAR") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UGPUSTDPAR") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UUSE_GPU") endif() # ############################################################################## diff --git a/apps/1d_stencil/stencil_stdexec.cpp b/apps/1d_stencil/stencil_stdexec.cpp index e2610a1..4078761 100644 --- a/apps/1d_stencil/stencil_stdexec.cpp +++ b/apps/1d_stencil/stencil_stdexec.cpp @@ -27,7 +27,7 @@ // // This example provides a stdexec implementation for the 1D stencil code. #include -#if defined(GPUSTDPAR) +#if defined(USE_GPU) #include #include #endif @@ -48,9 +48,9 @@ struct args_params_t : public argparse::Args { bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" - #if defined (GPUSTDPAR) + #if defined (USE_GPU) ", gpu, multigpu" - #endif //GPUSTDPAR + #endif //USE_GPU "]").set_default("cpu"); int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); @@ -128,14 +128,14 @@ int benchmark(args_params_t const& args) { case sch_t::CPU: solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt); break; -#if defined(GPUSTDPAR) +#if defined(USE_GPU) case sch_t::GPU: solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt); break; case sch_t::MULTIGPU: solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: std::cerr << "Unknown scheduler type encountered." 
<< std::endl; break; diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 7400023..f970bfa 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -33,7 +33,7 @@ // // serial fft function // -std::vector fft_serial(data_t *x, const int N, bool debug = false) +[[nodiscard]] std::vector fft_serial(const data_t *x, const int N, bool debug = false) { std::vector x_r(N); std::vector id(N); @@ -97,7 +97,7 @@ std::vector fft_serial(data_t *x, const int N, bool debug = false) int main(int argc, char* argv[]) { // parse params - fft_params_t args = argparse::parse(argc, argv); + const fft_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) @@ -139,10 +139,8 @@ int main(int argc, char* argv[]) Timer timer; // fft radix-2 algorithm - auto &&y = fft_serial(x_n.data(), N, args.debug); - // y[n] = fft(x[n]); - sig_t y_n(y); + sig_t y_n(std::move(fft_serial(x_n.data(), N, args.debug))); // stop timer auto elapsed = timer.stop(); diff --git a/apps/fft/fft-stdexec.cpp b/apps/fft/fft-stdexec.cpp index a725f6b..9a3e7fc 100644 --- a/apps/fft/fft-stdexec.cpp +++ b/apps/fft/fft-stdexec.cpp @@ -34,7 +34,7 @@ // // fft algorithm // -std::vector fft(data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) +[[nodiscard]] std::vector fft(const data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) { std::vector x_rev(N); std::vector ind(N); @@ -42,19 +42,20 @@ std::vector fft(data_t *x, scheduler auto sch, const int N, const int ma data_t *x_r = x_rev.data(); uint32_t *id = ind.data(); + // compute shift factor int shift = 32 - ilog2(N); + // twiddle bits for fft ex::sender auto twiddle = ex::transfer_just(sch, x_r, x, id) | ex::bulk(N, [=](int k, auto x_r, auto x, auto id){ id[k] = reverse_bits32(k) >> shift; x_r[k] = x[id[k]]; - }) - | ex::then([](auto &&...){}); - - ex::sync_wait(twiddle); + }); + ex::sync_wait(std::move(twiddle)); // niterations int niters = ilog2(N); + // local merge partition size int lN = 2; @@ -65,41 +66,52 @@ std::vector fft(data_t *x, scheduler auto sch, const int N, const int ma // transfer_just sender ex::sender auto tx = ex::transfer_just(sch, x_r); - for (int k = 0; k < niters; k++, lN*=2) + // iterate until niters - lN*=2 after each iteration + for (int it = 0; it < niters; it++, lN*=2) { - std::cout << (100.0 * k)/niters << "%.." << std::flush; + // print progress + std::cout << (100.0 * it)/niters << "%.." 
<< std::flush; + // debugging timer static Timer dtimer; // number of partitions int nparts = N/lN; int tpp = lN/2; + // display info only if debugging if (debug) { dtimer.start(); std::cout << "lN = " << lN << ", npartitions = " << nparts << ", partition size = " << tpp << std::endl; } + // parallel compute lN-pt FFT ex::sender auto merge = tx | ex::bulk(N/2, [=](auto k, auto y) { // compute indices int e = (k/tpp)*lN + (k % tpp); auto o = e + tpp; auto i = (k % tpp); + + // compute 2-pt DFT auto tmp = y[e] + y[o] * WNk(N, i * nparts); y[o] = y[e] - y[o] * WNk(N, i * nparts); y[e] = tmp; }); + // wait for pipeline ex::sync_wait(std::move(merge)); + // print only if debugging if (debug) std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; } + // print final progress mark std::cout << "100%" << std::endl; + // return x_rev = fft(x_r) return x_rev; } @@ -109,7 +121,7 @@ std::vector fft(data_t *x, scheduler auto sch, const int N, const int ma int main(int argc, char* argv[]) { // parse params - fft_params_t args = argparse::parse(argc, argv); + const fft_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) @@ -147,7 +159,7 @@ int main(int argc, char* argv[]) } // y[n] = fft(x[n]); - std::vector y; + std::vector y(N); // start the timer here Timer timer; @@ -160,14 +172,14 @@ int main(int argc, char* argv[]) case sch_t::CPU: y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug); break; -#if defined(GPUSTDPAR) +#if defined(USE_GPU) case sch_t::GPU: y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024*108, args.debug); break; case sch_t::MULTIGPU: y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4*1024*108, args.debug); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); } @@ -199,14 +211,14 @@ int main(int argc, char* argv[]) case sch_t::CPU: verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler()); break; -#if defined (GPUSTDPAR) +#if defined (USE_GPU) case sch_t::GPU: verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 73280c5..5f4e6ea 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -36,11 +36,11 @@ #include #include -#if defined(GPUSTDPAR) +#if defined(USE_GPU) #include #include using namespace nvexec; -#endif //GPUSTDPAR +#endif //USE_GPU #include #include "argparse/argparse.hpp" @@ -94,9 +94,9 @@ struct fft_params_t : public argparse::Args { #if defined(FFT_STDEXEC) std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" - #if defined (GPUSTDPAR) + #if defined (USE_GPU) ", gpu, multigpu" - #endif //GPUSTDPAR + #endif //USE_GPU "]").set_default("cpu"); #endif // FFT_STDEXEC @@ -171,6 +171,11 @@ class signal y = rhs.y; } + signal(std::vector &&in) + { + y = std::move(in); + } + signal(std::vector &in) { y = std::move(in); @@ -274,14 +279,15 @@ class signal std::cout << "]" << std::endl; } - bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) + [[nodiscard]] bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) { int N = y.size(); bool ret = true; - //int nparts = N/maxN; - //int psize = 
std::min(N, nparts); - //int matsize = psize * psize; + if (X.len() > maxN) + { + std::cout << "Input signal may be too large to compute DFT via y[n] = WNk * x[n]. Segfaults expected.." << std::endl; + } std::vector Y(N); std::vector M(N*N); @@ -330,4 +336,4 @@ class signal std::vector y; }; -using sig_t = signal; \ No newline at end of file +using sig_t = signal; diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index 3ea2988..cfde2c5 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -157,7 +157,7 @@ __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index 6af69b0..696ed87 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -52,7 +52,7 @@ void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-serial.cpp b/apps/heat-equation/heat-equation-serial.cpp index 1ae243b..6e07338 100644 --- a/apps/heat-equation/heat-equation-serial.cpp +++ b/apps/heat-equation/heat-equation-serial.cpp @@ -56,7 +56,7 @@ void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-stdexec.cpp b/apps/heat-equation/heat-equation-stdexec.cpp index 84dd968..7b45db6 100644 --- a/apps/heat-equation/heat-equation-stdexec.cpp +++ b/apps/heat-equation/heat-equation-stdexec.cpp @@ -36,7 +36,7 @@ // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { @@ -155,14 +155,14 @@ int main(int argc, char* argv[]) { case sch_t::CPU: algorithm(exec::static_thread_pool(nthreads).get_scheduler()); break; -#if defined(GPUSTDPAR) +#if defined(USE_GPU) case sch_t::GPU: algorithm(nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: algorithm(nvexec::multi_gpu_stream_context().get_scheduler()); break; -#endif // GPUSTDPAR +#endif // USE_GPU default: throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index b20fb68..ed65ddc 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -35,7 +35,7 @@ // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 17fc466..abbf0e0 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -34,11 
+34,11 @@ #include #include -#if defined(GPUSTDPAR) +#if defined(USE_GPU) #include #include using namespace nvexec; -#endif //GPUSTDPAR +#endif //USE_GPU #include "argparse/argparse.hpp" #include "commons.hpp" @@ -79,9 +79,9 @@ struct heat_params_t : public argparse::Args { #if defined(HEQ_STDEXEC) std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" - #if defined (GPUSTDPAR) + #if defined (USE_GPU) ", gpu, multigpu" - #endif //GPUSTDPAR + #endif //USE_GPU "]").set_default("cpu"); #endif // HEQ_STDEXEC diff --git a/include/commons.hpp b/include/commons.hpp index 584a11b..d917b42 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -93,9 +93,9 @@ enum class sch_t { CPU, GPU, MULTIGPU }; [[nodiscard]] sch_t get_sch_enum(std::string_view str) { static const std::map schmap = { {"cpu", sch_t::CPU}, -#if defined (GPUSTDPAR) +#if defined (USE_GPU) {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU} -#endif // GPUSTDPAR +#endif // USE_GPU }; if (schmap.contains(str)) { @@ -105,7 +105,7 @@ enum class sch_t { CPU, GPU, MULTIGPU }; throw std::invalid_argument("FATAL: " + std::string(str) + " is not a stdexec scheduler.\n" "Available schedulers: cpu" -#if defined (GPUSTDPAR) +#if defined (USE_GPU) ", gpu, multigpu" #endif "\n"