diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c6ee0c..810dad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,8 +171,14 @@ endif() # need to add appropriate flags for stdexec set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} --gcc-toolchain=/opt/cray/pe/gcc/12.2.0/bin/ -pthread" -) + "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP}") + +# add -cudalib=cublas if -stdpar=gpu +if (STDPAR STREQUAL "gpu") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_GPU") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -UUSE_GPU") +endif() # ############################################################################## # Add sub-directories diff --git a/apps/1d_stencil/CMakeLists.txt b/apps/1d_stencil/CMakeLists.txt index c5157d2..93f4fc2 100644 --- a/apps/1d_stencil/CMakeLists.txt +++ b/apps/1d_stencil/CMakeLists.txt @@ -13,7 +13,6 @@ target_include_directories( PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) -if("${STDPAR}" STREQUAL "gpu") # TODO, fix cmake add_executable(stencil_stdexec stencil_stdexec.cpp) target_link_libraries(stencil_stdexec stdexec) @@ -22,6 +21,7 @@ if("${STDPAR}" STREQUAL "gpu") PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) +if("${STDPAR}" STREQUAL "gpu") add_executable(stencil_cuda stencil_cuda.cpp) target_include_directories( stencil_cuda diff --git a/apps/1d_stencil/stencil_stdexec.cpp b/apps/1d_stencil/stencil_stdexec.cpp index ace96bc..4078761 100644 --- a/apps/1d_stencil/stencil_stdexec.cpp +++ b/apps/1d_stencil/stencil_stdexec.cpp @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 Weile Wei * Copyright (c) 2023 The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of any * required approvals from the U.S. Dept. of Energy).All rights reserved. @@ -27,8 +27,10 @@ // // This example provides a stdexec implementation for the 1D stencil code. #include +#if defined(USE_GPU) #include #include +#endif #include #include "argparse/argparse.hpp" @@ -45,7 +47,12 @@ struct args_params_t : public argparse::Args { bool& no_header = kwarg("no-header", "Do not print csv header row (default: false)").set_default(false); bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (USE_GPU) + ", gpu, multigpu" + #endif //USE_GPU + "]").set_default("cpu"); + int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); }; @@ -121,12 +128,14 @@ int benchmark(args_params_t const& args) { case sch_t::CPU: solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt); break; +#if defined(USE_GPU) case sch_t::GPU: solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt); break; case sch_t::MULTIGPU: solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt); break; +#endif // USE_GPU default: std::cerr << "Unknown scheduler type encountered." << std::endl; break; diff --git a/apps/comm-study/comm-study-no-senders.cpp b/apps/comm-study/comm-study-no-senders.cpp index 1550094..1377745 100644 --- a/apps/comm-study/comm-study-no-senders.cpp +++ b/apps/comm-study/comm-study-no-senders.cpp @@ -74,7 +74,7 @@ auto work(P& A, P& B, P& Y, int N) { // get sum(Y) - one last memcpy (not USM) D2H sum += - std::transform_reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus(), [](T &val){return val * val;}); + std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); return sum / N; } diff --git a/apps/comm-study/comm-study.cpp b/apps/comm-study/comm-study.cpp deleted file mode 100644 index 7629ce0..0000000 --- a/apps/comm-study/comm-study.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2023 The Regents of the University of California, - * through Lawrence Berkeley National Laboratory (subject to receipt of any - * required approvals from the U.S. Dept. of Energy). All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "commons.hpp" -#include "exec/static_thread_pool.hpp" - -using namespace std; -using namespace stdexec; -using stdexec::sync_wait; - -using T = double; -using time_point_t = std::chrono::system_clock::time_point; - -// must take in the pointers/vectors by reference -template -auto work(P& A, P& B, P& Y, int N) { - T sum = 0.0; - - // init A and B separately - will it cause an H2D copy? - sender auto s1 = then(just(), - [&] { - std::for_each(std::execution::par_unseq, &A[0], &A[N], - [&](T& ai) { ai = cos(M_PI / 4); }); - }) - // trigger a D2H here - | then([&] { - for (int i = 0; i < N / 3; i++) { - // read only or read-write operations - sum += A[i] / N; - - // this line if commented should not result in an H2D - // after this but it does. - // A[i] = sin(M_PI/4); - } - std::cout << std::endl; - }); - - // will it cause an H2D here? - sender auto s2 = then(just(), [&] { - std::for_each(std::execution::par_unseq, &B[0], &B[N], - [&](T& bi) { bi = sin(M_PI / 6); }); - }); - - // will s1 and s2 execute in parallel or not? - sync_wait(when_all(std::move(s1), std::move(s2))); - - // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) - sender auto s3 = - then(just(), - [&] { - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &A[0], [&](T& ai, T& bi) { return ai + bi; }); - std::transform(std::execution::par_unseq, &A[0], &A[N], &B[0], - &Y[0], [&](T& ai, T& bi) { - return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); - }); - }) - // should trigger a D2H copy of N/3 elements - | then([&] { - for (int i = 0; i < N / 3; i++) - sum += Y[i] / N; - - std::cout << std::endl; - }) - // get sum(Y) - wonder if there is another H2D as we only read it in the - // last step - | then([&] { - return std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, - std::plus()); - }); - - auto [val] = sync_wait(s3).value(); - - return sum += val; -} - -int main(int argc, char* argv[]) { - constexpr int N = 1e9; - time_point_t mark = std::chrono::system_clock::now(); - auto es = - std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - T sum = 0.0; - -#if 1 // 0 if only arrays - std::vector A(N); - std::vector B(N); - std::vector Y(N); - - mark = std::chrono::system_clock::now(); - sum = work(A, B, Y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Vectors: Elapsed Time: " << es << "s" << std::endl << std::endl; - - std::cout << fixed << "sum: " << sum << "\n"; -#endif - -#if 1 // 0 if only vectors - - // allocate memory - can we just allocate it on device only? - T* a = new T[N]; - T* b = new T[N]; - T* y = new T[N]; - - sum = 0; - mark = std::chrono::system_clock::now(); - sum = work(a, b, y, N); - es = std::chrono::duration(std::chrono::system_clock::now() - mark) - .count(); - std::cout << "Pointers: Elapsed Time: " << es << "s" << std::endl - << std::endl; - - // do not use scientific notation - std::cout << fixed << "sum: " << sum << "\n"; -#endif - - return 0; -} \ No newline at end of file diff --git a/apps/fft/CMakeLists.txt b/apps/fft/CMakeLists.txt index 25b80ce..10e89c1 100644 --- a/apps/fft/CMakeLists.txt +++ b/apps/fft/CMakeLists.txt @@ -2,6 +2,11 @@ project(fft LANGUAGES CXX) file(GLOB CPP_SOURCES "*.cpp") +# add -cudalib=cublas if -stdpar=gpu +if (STDPAR STREQUAL "gpu") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -cudalib=cublas") +endif() + foreach(source_file ${CPP_SOURCES}) if(NOT STDPAR STREQUAL "gpu") if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" @@ -18,16 +23,19 @@ foreach(source_file ${CPP_SOURCES}) add_executable(${exec_name} ${_EXCLUDE} ${source_file}) # add dependency on argparse - add_dependencies(${exec_name} argparse magic_enum) + add_dependencies(${exec_name} argparse) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX LINKER_LANGUAGE CXX) target_include_directories( ${exec_name} PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include - ${ARGPARSE_INCLUDE_DIR} ${MAGICENUM_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + + # uncomment only if using nvc++/23.1 - no need if nvc++/23.7 + # target_link_directories(${exec_name} PRIVATE /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64) - target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec) + target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec blas) set_target_properties( ${exec_name} diff --git a/apps/fft/fft-serial.cpp b/apps/fft/fft-serial.cpp index 38082e4..f970bfa 100644 --- a/apps/fft/fft-serial.cpp +++ b/apps/fft/fft-serial.cpp @@ -30,13 +30,74 @@ #include "fft.hpp" +// +// serial fft function +// +[[nodiscard]] std::vector fft_serial(const data_t *x, const int N, bool debug = false) +{ + std::vector x_r(N); + std::vector id(N); + + // bit shift + int shift = 32 - ilog2(N); + + // twiddle data in x[n] + for (int k = 0; k < N; k++) + { + id[k] = reverse_bits32(k) >> shift; + x_r[k] = x[id[k]]; + } + + // niterations + int niters = ilog2(N); + // local merge partition size + int lN = 2; + + // set cout precision + std::cout << std::fixed << std::setprecision(1); + + std::cout << "FFT progress: "; + + for (int k = 0; k < niters; k++, lN*=2) + { + std::cout << (100.0 * k)/niters << "%.." << std::flush; + + static Timer dtimer; + + // number of partitions + int nparts = N/lN; + int tpp = lN/2; + + if (debug) + dtimer.start(); + + // merge + for (int k = 0; k < N/2; k++) + { + // compute indices + int e = (k/tpp)*lN + (k % tpp); + auto o = e + tpp; + auto i = (k % tpp); + auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts); + x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts); + x_r[e] = tmp; + } + + if (debug) + std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; + } + + std::cout << "100%" << std::endl; + return x_r; +} + // // simulation // int main(int argc, char* argv[]) { // parse params - fft_params_t args = argparse::parse(argc, argv); + const fft_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) @@ -64,9 +125,6 @@ int main(int argc, char* argv[]) x_n.resize(N); } - // y[n] = fft(x[n]); - sig_t y_n(x_n); - if (print_sig) { std::cout << std::endl << "x[n] = "; @@ -81,7 +139,8 @@ int main(int argc, char* argv[]) Timer timer; // fft radix-2 algorithm - fft_serial(y_n.data(), N, N); + // y[n] = fft(x[n]); + sig_t y_n(std::move(fft_serial(x_n.data(), N, args.debug))); // stop timer auto elapsed = timer.stop(); @@ -101,7 +160,7 @@ int main(int argc, char* argv[]) // validate the recursively computed fft if (validate) { - if (x_n.isFFT(y_n)) + if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; else std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; diff --git a/apps/fft/fft-stdexec-multicore.cpp b/apps/fft/fft-stdexec-multicore.cpp deleted file mode 100644 index 045bb6a..0000000 --- a/apps/fft/fft-stdexec-multicore.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * MIT License - * - * Copyright (c) 2023 The Regents of the University of California, - * through Lawrence Berkeley National Laboratory (subject to receipt of any - * required approvals from the U.S. Dept. of Energy).All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* - * commons for the fft codes - */ - -#include "fft.hpp" - -using any_void_sender = - any_sender_of; - -// -// recursive multicore fft -// -any_void_sender fft_multicore(sender auto &&snd, data_t *x, int lN, const int N, int max_threads) -{ - // current merge stride - int stride = N/lN; - - // to check parallelism - //std::cout << "lN = " << lN << ", from tid: " << std::this_thread::get_id() << std::endl; - - // if parallelism > max threads => serial - if (stride >= max_threads) - { - // TODO: can this be improved? Putting it in ex::then doesn't sync - fft_serial(x, lN, N); - return just(); - } - - // base case - if (lN == 2) - { - // TODO: can this be improved? Putting it in ex::then doesn't sync - auto x_0 = x[0] + x[1]* WNk(N, 0); - x[1] = x[0] - x[1]* WNk(N, 0); - x[0] = x_0; - - return just(); - } - - // vectors for even and odd index elements - std::vector e(lN/2); - std::vector o(lN/2); - - // array to use in bulk - std::array dat{e.data(), o.data()}; - - // local thread pool and scheduler - exec::static_thread_pool pool_loc{std::min(lN/2, max_threads)}; - ex::sender auto snd_loc = schedule(pool_loc.get_scheduler()); - - // copy even and odd indexes to vectors and split sender - ex::sender auto merge = - ex::bulk(snd, lN/2, [&](int k){ - // copy data into vectors - e[k] = x[2*k]; - o[k] = x[2*k+1]; - }) - | ex::bulk(2, [=,&dat](int k){ - // NVC++ 23.1: passing `snd` here results in (nvc++-Fatal-/path/to/tools/cpp1 TERMINATED by signal 11) - // NVC++ 23.7 goes in forever loop - - // compute N/2 pt FFT on even and odd in bulk - fft_multicore(snd_loc, dat[k], lN/2, N, max_threads); - }) - | ex::bulk(lN/2, [&](int k){ - // combine even and odd FFTs - x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); - }); - - // wait to complete - ex::sync_wait(std::move(merge)); - - // return void sender - return just(); -} - -// -// simulation -// -int main(int argc, char* argv[]) -{ - // parse params - fft_params_t args = argparse::parse(argc, argv); - - // see if help wanted - if (args.help) - { - args.print(); // prints all variables - return 0; - } - - // simulation variables - int N = args.N; - sig_type_t sig_type = sig_type_t::box; - int max_threads = args.max_threads; - //int freq = args.freq; - bool print_sig = args.print_sig; - bool print_time = args.print_time; - bool validate = args.validate; - - // x[n] signal - sig_t x_n(N, sig_type); - - if (!isPowOf2(N)) - { - N = ceilPowOf2(N); - std::cout << "INFO: N is not a power of 2. Padding zeros => N = " << N << std::endl; - - x_n.resize(N); - } - - // y[n] = fft(x[n]); - sig_t y_n(x_n); - - if (print_sig) - { - std::cout << std::endl << "x[n] = "; - x_n.printSignal(); - std::cout << std::endl; - } - - // niterations - int niters = ilog2(N); - - // thread pool and scheduler - exec::static_thread_pool pool{max_threads}; - scheduler auto sched = pool.get_scheduler(); - - // start the timer here - Timer timer; - - // fft radix-2 algorithm - fft_multicore(schedule(sched), y_n.data(), N, N, max_threads); - - // stop timer - auto elapsed = timer.stop(); - - // print the fft(x) - if (print_sig) - { - std::cout << "X(k) = "; - y_n.printSignal(); - std::cout << std::endl; - } - - // print the computation time - if (print_time) - std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl; - - // validate the recursively computed fft - if (validate) - { - if (x_n.isFFT(y_n)) - std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; - else - std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; - } - - return 0; -} diff --git a/apps/fft/fft-stdexec.cpp b/apps/fft/fft-stdexec.cpp new file mode 100644 index 0000000..9a3e7fc --- /dev/null +++ b/apps/fft/fft-stdexec.cpp @@ -0,0 +1,233 @@ +/* + * MIT License + * + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * commons for the fft codes + */ + +#define FFT_STDEXEC +#include "fft.hpp" + +// +// fft algorithm +// +[[nodiscard]] std::vector fft(const data_t *x, scheduler auto sch, const int N, const int max_threads, bool debug = false) +{ + std::vector x_rev(N); + std::vector ind(N); + + data_t *x_r = x_rev.data(); + uint32_t *id = ind.data(); + + // compute shift factor + int shift = 32 - ilog2(N); + + // twiddle bits for fft + ex::sender auto twiddle = ex::transfer_just(sch, x_r, x, id) + | ex::bulk(N, [=](int k, auto x_r, auto x, auto id){ + id[k] = reverse_bits32(k) >> shift; + x_r[k] = x[id[k]]; + }); + ex::sync_wait(std::move(twiddle)); + + // niterations + int niters = ilog2(N); + + // local merge partition size + int lN = 2; + + // set cout precision + std::cout << std::fixed << std::setprecision(1); + std::cout << "FFT progress: "; + + // transfer_just sender + ex::sender auto tx = ex::transfer_just(sch, x_r); + + // iterate until niters - lN*=2 after each iteration + for (int it = 0; it < niters; it++, lN*=2) + { + // print progress + std::cout << (100.0 * it)/niters << "%.." << std::flush; + + // debugging timer + static Timer dtimer; + + // number of partitions + int nparts = N/lN; + int tpp = lN/2; + + // display info only if debugging + if (debug) + { + dtimer.start(); + std::cout << "lN = " << lN << ", npartitions = " << nparts << ", partition size = " << tpp << std::endl; + } + + // parallel compute lN-pt FFT + ex::sender auto merge = tx | ex::bulk(N/2, [=](auto k, auto y) + { + // compute indices + int e = (k/tpp)*lN + (k % tpp); + auto o = e + tpp; + auto i = (k % tpp); + + // compute 2-pt DFT + auto tmp = y[e] + y[o] * WNk(N, i * nparts); + y[o] = y[e] - y[o] * WNk(N, i * nparts); + y[e] = tmp; + }); + + // wait for pipeline + ex::sync_wait(std::move(merge)); + + // print only if debugging + if (debug) + std::cout << "This iter time: " << dtimer.stop() << " ms" << std::endl; + } + + // print final progress mark + std::cout << "100%" << std::endl; + + // return x_rev = fft(x_r) + return x_rev; +} + +// +// simulation +// +int main(int argc, char* argv[]) +{ + // parse params + const fft_params_t args = argparse::parse(argc, argv); + + // see if help wanted + if (args.help) + { + args.print(); // prints all variables + return 0; + } + + // simulation variables + int N = args.N; + sig_type_t sig_type = sig_type_t::box; + int max_threads = args.max_threads; + //int freq = args.freq; + bool print_sig = args.print_sig; + bool print_time = args.print_time; + bool validate = args.validate; + std::string sched = args.sch; + + // x[n] signal + sig_t x_n(N, sig_type); + + if (!isPowOf2(N)) + { + N = ceilPowOf2(N); + std::cout << "INFO: N is not a power of 2. Padding zeros => N = " << N << std::endl; + + x_n.resize(N); + } + + if (print_sig) + { + std::cout << std::endl << "x[n] = "; + x_n.printSignal(); + std::cout << std::endl; + } + + // y[n] = fft(x[n]); + std::vector y(N); + + // start the timer here + Timer timer; + + // initialize stdexec scheduler + sch_t scheduler = get_sch_enum(sched); + + // launch with appropriate stdexec scheduler + switch (scheduler) { + case sch_t::CPU: + y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug); + break; +#if defined(USE_GPU) + case sch_t::GPU: + y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024*108, args.debug); + break; + case sch_t::MULTIGPU: + y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4*1024*108, args.debug); + break; +#endif // USE_GPU + default: + throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); + } + + // y[n] = fft(x[n]) + sig_t y_n(y); + + // stop timer + auto elapsed = timer.stop(); + + // print the fft(x) + if (print_sig) + { + std::cout << "X(k) = "; + y_n.printSignal(); + std::cout << std::endl; + } + + // print the computation time + if (print_time) + std::cout << "Elapsed Time: " << elapsed << " ms" << std::endl; + + // validate the recursively computed fft + if (validate) + { + bool verify = true; + // launch with appropriate stdexec scheduler + switch (scheduler) { + case sch_t::CPU: + verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler()); + break; +#if defined (USE_GPU) + case sch_t::GPU: + verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); + break; + case sch_t::MULTIGPU: + verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); + break; +#endif // USE_GPU + default: + throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); + } + + if (verify) + std::cout << "SUCCESS: y[n] == fft(x[n])" << std::endl; + else + std::cout << "FAILED: y[n] != fft(x[n])" << std::endl; + } + + return 0; +} diff --git a/apps/fft/fft.hpp b/apps/fft/fft.hpp index 1700a30..5f4e6ea 100644 --- a/apps/fft/fft.hpp +++ b/apps/fft/fft.hpp @@ -34,9 +34,17 @@ #include #include -#include -#include "exec/static_thread_pool.hpp" +#include + +#if defined(USE_GPU) + #include + #include +using namespace nvexec; +#endif //USE_GPU + +#include #include "argparse/argparse.hpp" + #include "commons.hpp" using namespace std; @@ -46,10 +54,6 @@ using stdexec::sync_wait; namespace ex = stdexec; -template -using any_sender_of = typename exec::any_receiver_ref< - stdexec::completion_signatures>::template any_sender<>; - // 2D view using view_2d = std::extents; @@ -61,7 +65,7 @@ using data_t = std::complex; enum sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; using sig_type_t = sig_type; -#if defined (__NVCOMPILER) +// map for signals std::map sigmap{{"square",sig_type_t::square}, {"sinusoid", sig_type_t::sinusoid}, {"triangle", sig_type_t::sawtooth}, {"triangle", sig_type_t::triangle}, {"sinc", sig_type_t::sinc}, {"box", sig_type_t::box}}; @@ -78,31 +82,28 @@ sig_type_t getSignal(std::string &sig) } } -#else - -// if GCC available then just return yourself -sig_type_t getSignal(sig_type_t &sig) { return sig; } - -#endif // _NVCOMPILER - // input arguments struct fft_params_t : public argparse::Args { - - // NVC++ is not supported by magic_enum -#if !defined (__NVCOMPILER) - sig_type_t& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default(box); -#else + // NVC++ is not supported by magic_enum so using strings std::string& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default("box"); -#endif // !defined (__NVCOMPILER) int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); int& N = kwarg("N", "N-point FFT").set_default(1024); bool& print_sig = flag("p,print", "print x[n] and X(k)"); int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); +#if defined(FFT_STDEXEC) + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (USE_GPU) + ", gpu, multigpu" + #endif //USE_GPU + "]").set_default("cpu"); +#endif // FFT_STDEXEC + bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]"); bool& help = flag("h, help", "print help"); bool& print_time = flag("t,time", "print fft time"); + bool& debug = flag("d,debug", "print internal timers and launch configs"); }; inline bool isPowOf2(long long int x) { @@ -137,9 +138,16 @@ inline int ilog2(uint32_t x) bool complex_compare(data_t a, data_t b, double error = 0.0101) { auto r = (fabs(a.real() - b.real()) < error)? true: false; - auto i = (fabs(a.imag() - b.imag()) < error)? true: false; + return r && (fabs(a.imag() - b.imag()) < error)? true: false; +} - return (r && i); +uint32_t reverse_bits32(uint32_t x) +{ + x = ((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1); + x = ((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2); + x = ((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4); + x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8); + return (x >> 16) | (x << 16); } class signal @@ -163,6 +171,11 @@ class signal y = rhs.y; } + signal(std::vector &&in) + { + y = std::move(in); + } + signal(std::vector &in) { y = std::move(in); @@ -266,47 +279,57 @@ class signal std::cout << "]" << std::endl; } - bool isFFT(signal &X, int threads = std::thread::hardware_concurrency()) + [[nodiscard]] bool isFFT(signal &X, scheduler auto sch, int maxN = 20000) { int N = y.size(); bool ret = true; - data_t *Y = new data_t[N]; - data_t * M = new data_t[N*N]; - auto A = std::mdspan(M, N, N); + if (X.len() > maxN) + { + std::cout << "Input signal may be too large to compute DFT via y[n] = WNk * x[n]. Segfaults expected.." << std::endl; + } - // scheduler from a thread pool - exec::static_thread_pool ctx{std::min(threads, A.extent(0))}; - scheduler auto sch = ctx.get_scheduler(); + std::vector Y(N); + std::vector M(N*N); - ex::sender auto test = ex::bulk(schedule(sch), A.extent(0), [&](int i){ - for (auto j = 0; j < A.extent(1); j++){ - A(i, j) = WNk(N, i*j); - } - }) - // Compute fft - | ex::bulk(A.extent(0), [&](int i){ - for (auto j = 0; j < A.extent(1); j++){ - Y[i]+= A(i,j) * y[j]; - } - }) - // compare the computed fft with input - | ex::bulk(N, [&](int i){ - if (!complex_compare(X[i], Y[i])) + auto A = std::mdspan(M.data(), N, N); + auto mdy = std::mdspan(y.data(), N, 1); + auto mdY = std::mdspan(Y.data(), N, 1); + + data_t *F = M.data(); + data_t *X_ptr = X.data(); + data_t *Y_ptr = Y.data(); + + ex::sender auto init = ex::transfer_just(sch, F) | ex::bulk(N*N, [=](int k, auto F){ + int i = k / N; + int j = k % N; + F[k] = WNk(N, i*j); + }); + + // initialize + ex::sync_wait(init); + + // compute Y[n] = dft(x[n]) = WNk * x[n] + stdex::linalg::matrix_product(std::execution::par, A, mdy, mdY); + + // compare the computed Y[n] (dft) with X[n](fft) + ex::sender auto verify = ex::transfer_just(sch, ret, X_ptr, Y_ptr) + | ex::bulk(N, [](int k, auto &ret, auto X_ptr, auto Y_ptr){ + if (!complex_compare(X_ptr[k], Y_ptr[k])) { - std::cout << "y[" << i << "] = " << X[i] << " != WNk*x[" << i << "] = " << Y[i] << std::endl; + //std::cout << "y[" << i << "] = " << X[i] << " != x[" << i << "] = " << Y[i] << std::endl; ret = false; } + }) + | then([](auto ret, auto &&...) + { + return ret; }); // let the pipeline run - ex::sync_wait(test); - - // delete the memory - delete[] M; - delete[] Y; + auto [re] = ex::sync_wait(verify).value(); - return ret; + return re; } private: // y[n] @@ -314,45 +337,3 @@ class signal }; using sig_t = signal; - -// -// serial fft function -// -void fft_serial(data_t *x, int lN, const int N) -{ - int stride = N/lN; - - if (lN == 2) - { - auto x_0 = x[0] + x[1]* WNk(N, 0); - x[1] = x[0] - x[1]* WNk(N, 0); - x[0] = x_0; - return; - } - - // vectors for even and odd index elements - std::vector e(lN/2); - std::vector o(lN/2); - - // copy data into vectors - for (auto k = 0; k < lN/2; k++) - { - e[k] = x[2*k]; - o[k] = x[2*k+1]; - } - - // compute N/2 pt FFT on even - fft_serial(e.data(), lN/2, N); - - // compute N/2 pt FFT on odd - fft_serial(o.data(), lN/2, N); - - // combine even and odd FFTs - for (int k = 0; k < lN/2; k++) - { - x[k] = e[k] + o[k] * WNk(N, k * stride); - x[k+lN/2] = e[k] - o[k] * WNk(N, k * stride); - } - - return; -} \ No newline at end of file diff --git a/apps/heat-equation/CMakeLists.txt b/apps/heat-equation/CMakeLists.txt index c886dee..f09668f 100644 --- a/apps/heat-equation/CMakeLists.txt +++ b/apps/heat-equation/CMakeLists.txt @@ -5,7 +5,9 @@ file(GLOB CPP_SOURCES "*.cpp") foreach(source_file ${CPP_SOURCES}) if(NOT STDPAR STREQUAL "gpu") if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" - MATCHES ".*gpu.*stdpar.*") + MATCHES ".*gpu.*stdpar.*" + OR "${source_file}" + MATCHES ".*cuda.*") message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") continue() endif() diff --git a/apps/heat-equation/heat-equation-cuda.cpp b/apps/heat-equation/heat-equation-cuda.cpp index 3ea2988..cfde2c5 100644 --- a/apps/heat-equation/heat-equation-cuda.cpp +++ b/apps/heat-equation/heat-equation-cuda.cpp @@ -157,7 +157,7 @@ __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) { // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-omp.cpp b/apps/heat-equation/heat-equation-omp.cpp index 6af69b0..696ed87 100644 --- a/apps/heat-equation/heat-equation-omp.cpp +++ b/apps/heat-equation/heat-equation-omp.cpp @@ -52,7 +52,7 @@ void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-serial.cpp b/apps/heat-equation/heat-equation-serial.cpp index 1ae243b..6e07338 100644 --- a/apps/heat-equation/heat-equation-serial.cpp +++ b/apps/heat-equation/heat-equation-serial.cpp @@ -56,7 +56,7 @@ void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation-stdexec.cpp b/apps/heat-equation/heat-equation-stdexec.cpp index 11b6cc6..7b45db6 100644 --- a/apps/heat-equation/heat-equation-stdexec.cpp +++ b/apps/heat-equation/heat-equation-stdexec.cpp @@ -36,7 +36,7 @@ // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { @@ -155,12 +155,14 @@ int main(int argc, char* argv[]) { case sch_t::CPU: algorithm(exec::static_thread_pool(nthreads).get_scheduler()); break; +#if defined(USE_GPU) case sch_t::GPU: algorithm(nvexec::stream_context().get_scheduler()); break; case sch_t::MULTIGPU: algorithm(nvexec::multi_gpu_stream_context().get_scheduler()); break; +#endif // USE_GPU default: throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers"); } diff --git a/apps/heat-equation/heat-equation-stdpar.cpp b/apps/heat-equation/heat-equation-stdpar.cpp index b20fb68..ed65ddc 100644 --- a/apps/heat-equation/heat-equation-stdpar.cpp +++ b/apps/heat-equation/heat-equation-stdpar.cpp @@ -35,7 +35,7 @@ // int main(int argc, char* argv[]) { // parse params - heat_params_t args = argparse::parse(argc, argv); + const heat_params_t args = argparse::parse(argc, argv); // see if help wanted if (args.help) { diff --git a/apps/heat-equation/heat-equation.hpp b/apps/heat-equation/heat-equation.hpp index 1a5438f..abbf0e0 100644 --- a/apps/heat-equation/heat-equation.hpp +++ b/apps/heat-equation/heat-equation.hpp @@ -33,13 +33,18 @@ #include #include #include -#include -#include + +#if defined(USE_GPU) + #include + #include +using namespace nvexec; +#endif //USE_GPU + #include "argparse/argparse.hpp" #include "commons.hpp" namespace ex = stdexec; -using namespace nvexec; + using namespace exec; // data type @@ -73,7 +78,11 @@ struct heat_params_t : public argparse::Args { #endif // HEQ_OMP || HEQ_STDEXEC #if defined(HEQ_STDEXEC) - std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu, gpu, multigpu]").set_default("cpu"); + std::string& sch = kwarg("sch", "stdexec scheduler: [options: cpu" + #if defined (USE_GPU) + ", gpu, multigpu" + #endif //USE_GPU + "]").set_default("cpu"); #endif // HEQ_STDEXEC Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f); diff --git a/include/commons.hpp b/include/commons.hpp index 240a443..d917b42 100644 --- a/include/commons.hpp +++ b/include/commons.hpp @@ -92,7 +92,11 @@ enum class sch_t { CPU, GPU, MULTIGPU }; [[nodiscard]] sch_t get_sch_enum(std::string_view str) { static const std::map schmap = { - {"cpu", sch_t::CPU}, {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU}}; + {"cpu", sch_t::CPU}, +#if defined (USE_GPU) + {"gpu", sch_t::GPU}, {"multigpu", sch_t::MULTIGPU} +#endif // USE_GPU +}; if (schmap.contains(str)) { return schmap.at(str); @@ -100,6 +104,10 @@ enum class sch_t { CPU, GPU, MULTIGPU }; throw std::invalid_argument("FATAL: " + std::string(str) + " is not a stdexec scheduler.\n" - "Available schedulers: cpu (static thread pool), gpu, multigpu.\n" + "Available schedulers: cpu" +#if defined (USE_GPU) + ", gpu, multigpu" +#endif + "\n" "Exiting...\n"); }