diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index bfa1a7a..a5eae85 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -7,7 +7,15 @@ target_include_directories( ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) add_executable(choleskey_stdpar choleskey_stdpar.cpp) +target_link_libraries(choleskey_stdpar stdexec) target_include_directories( choleskey_stdpar PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + +add_executable(choleskey_stdpar_snd choleskey_stdpar_snd.cpp) +target_link_libraries(choleskey_stdpar_snd stdexec) +target_include_directories( + choleskey_stdpar_snd + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) diff --git a/apps/choleskey/choleskey_stdpar_snd.cpp b/apps/choleskey/choleskey_stdpar_snd.cpp new file mode 100644 index 0000000..4fa7d79 --- /dev/null +++ b/apps/choleskey/choleskey_stdpar_snd.cpp @@ -0,0 +1,210 @@ +/* + * MIT License + * + * Copyright (c) 2023 Chuanqiu He + * Copyright (c) 2023 Weile Wei + * Copyright (c) 2023 The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of any + * required approvals from the U.S. Dept. of Energy).All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +// +// This example provides a stdexec(senders/receivers) implementation for choleskey decomposition code. +#include +#include +#include +#include +#include +#include +#include +#include "argparse/argparse.hpp" +#include "commons.hpp" +#include "exec/static_thread_pool.hpp" + +#include "matrixutil.hpp" +// using namespace stdexec; + +using namespace std; + +struct solver { + + using view_2d = std::extents; + + template + std::vector> Cholesky_Decomposition(std::vector& vec, int n, + int np) { + + // test here first, scheduler from a thread pool + exec::static_thread_pool pool(np); + stdexec::scheduler auto sch = pool.get_scheduler(); + stdexec::sender auto begin = stdexec::schedule(sch); + + std::vector> lower(n, std::vector(n, 0)); + + auto matrix_ms = + std::mdspan(vec.data(), n, n); + + auto multiplier_lambda = [=](auto a, auto b) { + return a * b; + }; + + for (int i = 0; i < matrix_ms.extent(0); i++) { + for (int j = 0; j <= i; j++) { + // avoid over parallelize + if (j == 0) { + np = 1; + } else if (j > 0 && np > j) { + np = j; + } + + if (j == i) // summation for diagonals + { + + if (i == 0 && j == 0) { + lower[j][j] = std::sqrt(matrix_ms(i, j)); + } else { + + std::vector sum_vec(np); // sub res for each piece + int size = j; // there are j elements need to be calculated(power) + + stdexec::sender auto send1 = + stdexec::bulk(begin, np, + [&](int piece) { + int start = piece * size / np; + int chunk_size = size / np; + int remaining = size % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, + counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, [=](int val) { + return lower[j][val] * lower[j][val]; + }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); + + lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); + } + + } else { + // Evaluating L(i, j) using L(j, j) + + if (j == 0) { + lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; + } else { + + std::vector sum_vec(np); // sub res for each piece + int size_nondiag = j; + + stdexec::sender auto send2 = + stdexec::bulk( + begin, np, + [&](int piece) { + int start = piece * size_nondiag / np; + int chunk_size = size_nondiag / np; + int remaining = size_nondiag % np; + chunk_size += (piece == np - 1) ? remaining : 0; + + sum_vec[piece] = std::transform_reduce( + std::execution::par, counting_iterator(start), + counting_iterator(start + chunk_size), 0, + std ::plus{}, + [=](int k) { return lower[j][k] * lower[i][k]; }); + }) | + stdexec::then([&sum_vec]() { + return std::reduce(std::execution::par, sum_vec.begin(), + sum_vec.end()); + }); + + auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); + + lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; + } + } + } + } + return lower; + } +}; + +/////////////////////////////////////////////////////////////////////////////// +int benchmark(args_params_t const& args) { + + std::uint64_t nd = args.nd; // Number of matrix dimension. + std::uint64_t np = args.np; // Number of parallel partitions. + + std::vector inputMatrix = generate_pascal_matrix(nd); + + // Create the solver object + solver solve; + + // Measure execution time. + Timer timer; + + // start decomposation + auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); + + // Print the final results + if (args.results) { + // Displaying Lower Triangular and its Transpose + cout << setw(6) << " Lower Triangular" << setw(30) << "Transpose" << endl; + for (int i = 0; i < nd; i++) { + // Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[i][j] << "\t"; + cout << "\t"; + + // Transpose of Lower Triangular + for (int j = 0; j < nd; j++) + cout << setw(6) << res_matrix[j][i] << "\t"; + cout << endl; + } + } + + if (args.time) { + std::cout << "Duration: " << time << " ms." + << "\n"; + } + + return 0; +} + +// Driver Code for testing +int main(int argc, char* argv[]) { + + // parse params + args_params_t args = argparse::parse(argc, argv); + // see if help wanted + if (args.help) { + args.print(); // prints all variables + return 0; + } + + benchmark(args); + + return 0; +} diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 46206c3..44f0468 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -35,7 +35,7 @@ struct args_params_t : public argparse::Args { std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") .set_default(10); - + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); bool& help = flag("h, help", "print help"); bool& time = kwarg("t, time", "print time").set_default(true); };