From f18b4a7ce1ec54374ad4880e718e7102887f5381 Mon Sep 17 00:00:00 2001 From: hcq9102 Date: Mon, 4 Dec 2023 14:45:29 -0800 Subject: [PATCH] cholesky_tiled_openmp implement --- apps/choleskey/CMakeLists.txt | 8 ++ apps/choleskey/cholesky_tiled_omp.cpp | 166 ++++++++++++++++++++++++ apps/choleskey/matrixutil.hpp | 179 ++++++++++++++++++++++---- 3 files changed, 326 insertions(+), 27 deletions(-) create mode 100644 apps/choleskey/cholesky_tiled_omp.cpp diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt index fb87553..a78fe9d 100644 --- a/apps/choleskey/CMakeLists.txt +++ b/apps/choleskey/CMakeLists.txt @@ -21,3 +21,11 @@ target_include_directories( # choleskey_stdpar_snd # PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include # ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) + +#tiled cholesky +add_executable(cholesky_tiled cholesky_tiled.cpp) +target_link_libraries(cholesky_tiled PRIVATE hpcpp-core blas lapack -fortranlibs gfortran) +target_include_directories( + cholesky_tiled + PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include + ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS) diff --git a/apps/choleskey/cholesky_tiled_omp.cpp b/apps/choleskey/cholesky_tiled_omp.cpp new file mode 100644 index 0000000..540edaa --- /dev/null +++ b/apps/choleskey/cholesky_tiled_omp.cpp @@ -0,0 +1,166 @@ +/* +Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks. + references: + >Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53. + >Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.) + +Additionally, include openblas library when build: + or $ export OPENBLAS_DIR=/openblas/path +*/ +#define CHOLESKY_OMP + +#include +#include +#include +#include +#include "matrixutil.hpp" + +void tiled_cholesky(double* matrix_split[], const int tile_size, const int num_tiles, const CBLAS_ORDER blasLay, + const int lapackLay) { + unsigned int m = 0, n = 0, k = 0; + + for (k = 0; k < num_tiles; ++k) { //POTRF +#pragma omp task depend(inout : matrix_split[k * num_tiles + k]) + { int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size); } + + for (m = k + 1; m < num_tiles; ++m) { //DTRSM +#pragma omp task depend(in : matrix_split[k * num_tiles + k]) depend(inout : matrix_split[m * num_tiles + k]) + { + cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0, + matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size); + } + } + + for (n = k + 1; n < num_tiles; ++n) { //DSYRK +#pragma omp task depend(in : matrix_split[n * num_tiles + k]) depend(inout : matrix_split[n * num_tiles + n]) + { + cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, + matrix_split[n * num_tiles + k], tile_size, 1.0, matrix_split[n * num_tiles + n], + tile_size); + } + + for (m = n + 1; m < num_tiles; ++m) { //DGEMM +#pragma omp task depend(in : matrix_split[m * num_tiles + k], matrix_split[n * num_tiles + k]) \ + depend(inout : matrix_split[m * num_tiles + n]) + { + cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0, + matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size, + 1.0, matrix_split[m * num_tiles + n], tile_size); + } + } + } + } +} + +int main(int argc, char** argv) { + // parse params + args_params_t args = argparse::parse(argc, argv); + int tile_size = 0, tot_tiles = 0; + + std::uint64_t matrix_size = args.mat_size; // Number of matrix dimension. + std::uint64_t num_tiles = args.num_tiles; // matrix size MUST be divisible + int verifycorrectness = args.verifycorrectness; + int nthreads = args.nthreads; + + bool layRow = true; + + fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles); + + // Check mat_size is divisible by num_tiles + if (matrix_size % num_tiles != 0) { + fmt::print("matrix size must be divisible by num_tiles.. aborting\n"); + throw std::invalid_argument("Matrix size is not divisible by num_tiles"); + } + + if (matrix_size == 0) { + fmt::print(" 0 illegal input matrix_size \n"); + std::exit(0); + } + + tile_size = matrix_size / num_tiles; + tot_tiles = num_tiles * num_tiles; + + double* A = new double[matrix_size * matrix_size]; + + // Allocate memory for tiled_cholesky for the full matrix + double* A_lower = new double[matrix_size * matrix_size]; + + // Allocate memory for MKL cholesky for the full matrix + double* A_MKL = new double[matrix_size * matrix_size]; + + // Memory allocation for tiled matrix + double** Asplit = new double*[tot_tiles]; + + for (int i = 0; i < tot_tiles; ++i) { + // Buffer per tile + Asplit[i] = new double[tile_size * tile_size]; + } + + //Generate a symmetric positve-definite matrix + A = generate_positiveDefinitionMatrix(matrix_size); + + //printMatrix(A, mat_size_m); + + CBLAS_ORDER blasLay; + int lapackLay; + + if (layRow) { + blasLay = CblasRowMajor; + lapackLay = LAPACK_ROW_MAJOR; + } else { + blasLay = CblasColMajor; + lapackLay = LAPACK_COL_MAJOR; + } + + //copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL) + std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(double)); + std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(double)); + + //splits the input matrix into tiles + split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow); + + // Measure execution time. + Timer timer; + //run the tiled Cholesky function +#pragma omp parallel num_threads(nthreads) + { +#pragma omp master + { tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay); } + } + + //assembling seperated tiles back into full matrix + assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow); + auto time = timer.stop(); + if (args.time) { + fmt::print("Duration: {:f} ms\n", time); + } + + //calling LAPACKE_dpotrf cholesky for verification and timing comparison + int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size); + + if (verifycorrectness == 1) { + bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size); + if (res) { + fmt::print("Tiled Cholesky decomposition successful\n"); + } else { + fmt::print("Tiled Cholesky decomposition failed\n"); + } + fmt::print("\n"); + } + + // print lower_matrix if tiled_cholesky sucessfull + if (args.lower_matrix) { + fmt::print("The lower matrix of input matrix after tiled_cholesky: \n"); + printLowerResults(A_lower, matrix_size); + } + + //memory free + delete[] A; + delete[] A_lower; + delete[] A_MKL; + for (int i = 0; i < tot_tiles; ++i) { + delete[] Asplit[i]; + } + + return 0; +} \ No newline at end of file diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp index 44f0468..5eada4d 100644 --- a/apps/choleskey/matrixutil.hpp +++ b/apps/choleskey/matrixutil.hpp @@ -1,7 +1,12 @@ #pragma once -#include -#include +#include +#include +#include +#include +#include +#include "argparse/argparse.hpp" +#include "commons.hpp" // generate positive definition matrix template @@ -9,33 +14,153 @@ using Matrix = std::vector>; template std::vector generate_pascal_matrix(const int n) { - Matrix matrix(n, std::vector(n, static_cast(0))); - - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - if (i == 0 || j == 0) { - matrix[i][j] = static_cast(1); - } else { - matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; - } - } - } - - std::vector flattenedVector; - for (const auto& row : matrix) { - flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); - } - return std::move(flattenedVector); + Matrix matrix(n, std::vector(n, static_cast(0))); + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + if (i == 0 || j == 0) { + matrix[i][j] = static_cast(1); + } else { + matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; + } + } + } + + std::vector flattenedVector; + for (const auto& row : matrix) { + flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); + } + return std::move(flattenedVector); } // parameters define struct args_params_t : public argparse::Args { - bool& results = kwarg("results", "print generated results (default: false)") - .set_default(true); - std::uint64_t& nd = - kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") - .set_default(10); - std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); - bool& help = flag("h, help", "print help"); - bool& time = kwarg("t, time", "print time").set_default(true); + std::uint64_t& mat_size = kwarg("mat_size", "size of input matrix_size").set_default(10); + std::uint64_t& num_tiles = kwarg("num_tiles", "number of tiles").set_default(2); + int& verifycorrectness = + kwarg("verifycorrectness", "verify the tiled_cholesky results with LAPACKE_dpotrf cholesky").set_default(1); +#if defined(CHOLESKY_OMP) + int& nthreads = kwarg("nthreads", "number of threads").set_default(1); +#endif // CHOLESKY_OMP + bool& lower_matrix = kwarg("lower_matrix", "print generated results (default: false)").set_default(true); + bool& help = flag("h, help", "print help"); + bool& time = kwarg("t, time", "print time").set_default(true); + std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); }; + +// help functions for tiled_cholesky +double* generate_positiveDefinitionMatrix(const size_t matrix_size) { + double* A_matrix = new double[matrix_size * matrix_size]; + double* pd_matrix = (double*)malloc(matrix_size * matrix_size * sizeof(double)); + unsigned int seeds = matrix_size; + + // generate a random symmetric matrix + for (size_t row = 0; row < matrix_size; ++row) { + for (size_t col = row; col < matrix_size; ++col) { + A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] = (double)rand_r(&seeds) / RAND_MAX; + } + } + // compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix. + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix, + matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size); + + // Adjust Diagonals + for (size_t row = 0; row < matrix_size; ++row) { + double diagonals = 1.0; // from 1.0 + for (size_t col = 0; col < matrix_size; ++col) { + diagonals += pd_matrix[row * matrix_size + col]; + } + // Set the diag entry + pd_matrix[row * matrix_size + row] = diagonals; + } + + delete[] A_matrix; + return pd_matrix; +} + +void split_into_tiles(const double* matrix, double* matrix_split[], const int num_tiles, const int tile_size, + const int size, bool layRow) { + + int total_num_tiles = num_tiles * num_tiles; + int offset_tile; + + //#pragma omp parallel for private(i, j, offset_tile) schedule(auto) + for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) { + if (layRow) { + offset_tile = + int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size; + } else { + offset_tile = + int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size; + } + + for (int i = 0; i < tile_size; ++i) + //#pragma simd + for (int j = 0; j < tile_size; ++j) { + matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j]; + } + } +} + +double* assemble_tiles(double* matrix_split[], double* matrix, const int num_tiles, const int tile_size, const int size, + bool layRow) { + int i_tile, j_tile, tile, i_local, j_local; + //#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \ + schedule(auto) + for (int i = 0; i < size; ++i) { + i_local = int(i % tile_size); + i_tile = int(i / tile_size); + //#pragma simd private(j_tile, tile, j_local) + for (int j = 0; j < size; ++j) { + j_tile = int(j / tile_size); + if (layRow) { + tile = i_tile * num_tiles + j_tile; + } else { + tile = j_tile * num_tiles + i_tile; + } + j_local = int(j % tile_size); + matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local]; + } + } + return matrix; +} + +bool verify_results(const double* lower_res, const double* dporft_res, const int totalsize) { + bool res = true; + double diff; + for (int i = 0; i < totalsize; ++i) { + diff = dporft_res[i] - lower_res[i]; + if (fabs(dporft_res[i]) > 1e-5) { + diff /= dporft_res[i]; + } + diff = fabs(diff); + if (diff > 1.0e-5) { + fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]); + res = false; + break; + } + } + return res; +} + +void printLowerResults(const double* matrix, size_t matrix_size) { + for (size_t row = 0; row < matrix_size; ++row) { + for (size_t col = 0; col <= row; ++col) { + fmt::print("{}\t", matrix[row * matrix_size + col]); + } + fmt::print("\n"); + } +} + +void print_mat_split(double* matrix_split[], int num_tiles, int tile_size) { + for (int itile = 0; itile < num_tiles * num_tiles; ++itile) { + fmt::print("Block {}:\n", itile); + for (int i = 0; i < tile_size; ++i) { + for (int j = 0; j < tile_size; ++j) { + fmt::print("{} ", matrix_split[itile][i * tile_size + j]); + } + fmt::print("\n"); + } + fmt::print("\n"); + } +} \ No newline at end of file