Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Omp tiled #28

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions apps/choleskey/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,18 @@ target_include_directories(
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

# TODO: remove this example add_executable(choleskey_stdpar_snd
# choleskey_stdpar_snd.cpp) target_link_libraries(choleskey_stdpar_snd stdexec)
# target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR}
# ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR}
# ${MDSPAN_INCLUDE_DIR})
# TODO: remove this example
# add_executable(choleskey_stdpar_snd choleskey_stdpar_snd.cpp)
# target_link_libraries(choleskey_stdpar_snd stdexec)
# target_include_directories(
# choleskey_stdpar_snd
# PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
# ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

#tiled cholesky omp
add_executable(cholesky_tiled_omp cholesky_tiled_omp.cpp )
target_link_libraries(cholesky_tiled_omp PRIVATE hpcpp-core blas lapack -fortranlibs gfortran)
target_include_directories(
cholesky_tiled_omp
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS)
198 changes: 198 additions & 0 deletions apps/choleskey/cholesky_tiled_omp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
/*
* MIT License
*
* Copyright (c) 2023 Chuanqiu He
* Copyright (c) 2023 The Regents of the University of California,
* through Lawrence Berkeley National Laboratory (subject to receipt of any
* required approvals from the U.S. Dept. of Energy).All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

/*
* implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks.
* references:
* >Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53.
* >Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.)
*/

#define CHOLESKY_OMP

#include "matrixutil.hpp"

void tiled_cholesky(data_type* matrix_split[], const std::size_t tile_size, const std::size_t num_tiles,
CBLAS_ORDER blasLay, const int lapackLay) {
std::size_t m = 0, n = 0, k = 0;

for (k = 0; k < num_tiles; ++k) { //POTRF
#pragma omp task depend(inout : matrix_split[k * num_tiles + k])
{ int info = LAPACKE_dpotrf(lapackLay, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size); }

for (m = k + 1; m < num_tiles; ++m) { //DTRSM
#pragma omp task depend(in : matrix_split[k * num_tiles + k]) depend(inout : matrix_split[m * num_tiles + k])
{
cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size);
}
}

for (n = k + 1; n < num_tiles; ++n) { //DSYRK
#pragma omp task depend(in : matrix_split[n * num_tiles + k]) depend(inout : matrix_split[n * num_tiles + n])
{
cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0,
matrix_split[n * num_tiles + k], tile_size, 1.0, matrix_split[n * num_tiles + n],
tile_size);
}
for (m = n + 1; m < num_tiles; ++m) { //DGEMM
#pragma omp task depend(in : matrix_split[m * num_tiles + k], matrix_split[n * num_tiles + k]) \
depend(inout : matrix_split[m * num_tiles + n])
{
cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size,
1.0, matrix_split[m * num_tiles + n], tile_size);
}
}
}
}
}

int main(int argc, char** argv) {

// parse params
args_params_t args = argparse::parse<args_params_t>(argc, argv);

const std::size_t matrix_size = args.mat_size; // Number of matrix dimension.
const std::size_t num_tiles = args.num_tiles; // matrix size MUST be divisible
bool verifycorrectness = args.verifycorrectness; // verify tiled_cholesky results with MKL cholesky
bool layRow = args.layRow; // set the matrix in row-major order(default)
int nthreads = args.nthreads;

fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles);

// Check mat_size is divisible by num_tiles
if (matrix_size % num_tiles != 0) {
fmt::print("matrix size must be divisible by num_tiles.. aborting\n");
throw std::invalid_argument("Matrix size is not divisible by num_tiles");
}

if (matrix_size == 0) {
fmt::print("0 is an illegal input matrix_size \n");
std::exit(0);
}

const std::size_t tile_size = {matrix_size / num_tiles};
const std::size_t tot_tiles = {num_tiles * num_tiles};

data_type* A = new data_type[matrix_size * matrix_size];

// Allocate memory for tiled_cholesky for the full matrix
data_type* A_cholesky = new data_type[matrix_size * matrix_size];

// Allocate memory for MKL cholesky for the full matrix
data_type* A_MKL = new data_type[matrix_size * matrix_size];

// Memory allocation for tiled matrix
data_type** Asplit = new data_type*[tot_tiles];

for (std::size_t i = 0; i < tot_tiles; ++i) {
// Buffer per tile
Asplit[i] = new data_type[tile_size * tile_size];
}

//Generate a symmetric positve-definite matrix
A = generate_positiveDefinitionMatrix(matrix_size);

//copying matrices into separate variables for tiled cholesky (A_cholesky)
std::memcpy(A_cholesky, A, matrix_size * matrix_size * sizeof(data_type));
//copying matrices into separate variables for MKL cholesky (A_MKL)
std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(data_type));

// CBLAS_ORDER: Indicates whether a matrix is in row-major or column-major order.
CBLAS_ORDER blasLay;
int lapackLay;

if (layRow) {
blasLay = CblasRowMajor;
lapackLay = LAPACK_ROW_MAJOR;
} else {
blasLay = CblasColMajor;
lapackLay = LAPACK_COL_MAJOR;
}

//splits the input matrix into tiles
split_into_tiles(A_cholesky, Asplit, num_tiles, tile_size, matrix_size, layRow);

// Measure execution time of tiled_cholesky
Timer timer;
//run the tiled Cholesky function
#pragma omp parallel num_threads(nthreads)
{
#pragma omp master
{ tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay); }
}

auto time = timer.stop();

if (args.time) {
fmt::print("Time for tiled_cholesky decomposition(omp), Duration: {:f} ms\n", time);
}

//assembling seperated tiles back into full matrix
assemble_tiles(Asplit, A_cholesky, num_tiles, tile_size, matrix_size, layRow);

//calling LAPACKE_dpotrf cholesky for verification
// Measure execution time of MKL Cholesky decomposition
Timer timer2;
int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size);
auto time2 = timer2.stop();
if (args.time) {
fmt::print("Time for MKL Cholesky decomposition, Duration: {:f} ms\n", time2);
}

if (info != 0) {
fmt::print("Error with dpotrf, info = {}\n", info);
}

if (verifycorrectness) {
bool res = verify_results(A_cholesky, A_MKL, matrix_size * matrix_size);
fmt::print("\n");
if (res) {
fmt::print("Tiled Cholesky decomposition successful.\n");
} else {
fmt::print("Tiled Cholesky decomposition failed.\n");
}
fmt::print("\n");
}

// print lower_matrix if tiled_cholesky sucessfull
if (args.lower_matrix) {
fmt::print("The lower matrix of input matrix after tiled_cholesky: \n");
printLowerResults(A_cholesky, matrix_size);
}

//memory free
delete[] A;
delete[] A_cholesky;
delete[] A_MKL;
for (std::size_t i = 0; i < tot_tiles; ++i) {
delete[] Asplit[i];
}

return 0;
}
136 changes: 131 additions & 5 deletions apps/choleskey/matrixutil.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
#pragma once

#include <iostream>
#include <vector>
#include <cblas.h>
#include <lapacke.h>
#include <omp.h>
#include <cstddef>
#include <cstdlib>
#include "argparse/argparse.hpp"
#include "commons.hpp"

using data_type = double;

// generate positive definition matrix
template <typename T>
Expand Down Expand Up @@ -30,9 +37,128 @@ std::vector<T> generate_pascal_matrix(const int n) {

// parameters define
struct args_params_t : public argparse::Args {
bool& results = kwarg("results", "print generated results (default: false)").set_default(true);
std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10);
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
bool& layRow = kwarg("layRow", "set the matrix in row-major order(false:column-major order). ").set_default(true);
std::size_t& mat_size = kwarg("mat_size", "size of input matrix_size").set_default(10);
std::size_t& num_tiles = kwarg("num_tiles", "number of tiles").set_default(2);
bool& verifycorrectness =
kwarg("verifycorrectness", "verify the tiled_cholesky results with LAPACKE_dpotrf cholesky").set_default(true);
#if defined(CHOLESKY_OMP)
int& nthreads = kwarg("nthreads", "number of threads").set_default(1);
#endif // CHOLESKY_OMP
bool& lower_matrix = kwarg("lower_matrix", "print generated results (default: false)").set_default(true);
bool& help = flag("h, help", "print help");
bool& time = kwarg("t, time", "print time").set_default(true);
};

// help functions for tiled_cholesky
data_type* generate_positiveDefinitionMatrix(const std::size_t matrix_size) {
data_type* A_matrix = new data_type[matrix_size * matrix_size];
data_type* pd_matrix = (data_type*)malloc(matrix_size * matrix_size * sizeof(data_type));
std::size_t seeds = matrix_size;

// generate a random symmetric matrix
for (std::size_t row = 0; row < matrix_size; ++row) {
for (std::size_t col = row; col < matrix_size; ++col) {
A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] =
(data_type)rand_r((unsigned int*)&seeds) / RAND_MAX;
}
}
// compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix,
matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size);

// Adjust Diagonals
for (std::size_t row = 0; row < matrix_size; ++row) {
double diagonals = 1.0; // from 1.0
for (std::size_t col = 0; col < matrix_size; ++col) {
diagonals += pd_matrix[row * matrix_size + col];
}
// Set the diag entry
pd_matrix[row * matrix_size + row] = diagonals;
}

delete[] A_matrix;
return pd_matrix;
}

void split_into_tiles(const data_type* matrix, data_type* matrix_split[], const std::size_t num_tiles,
const std::size_t tile_size, const std::size_t matrixsize, bool layRow) {

std::size_t total_num_tiles = num_tiles * num_tiles;
std::size_t offset_tile, i, j;
#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
for (std::size_t i_tile = 0; i_tile < total_num_tiles; ++i_tile) {
if (layRow) {
offset_tile = std::size_t(i_tile / num_tiles) * num_tiles * tile_size * tile_size +
std::size_t(i_tile % num_tiles) * tile_size;
} else {
offset_tile = std::size_t(i_tile % num_tiles) * num_tiles * tile_size * tile_size +
std::size_t(i_tile / num_tiles) * tile_size;
}

for (i = 0; i < tile_size; ++i)
for (j = 0; j < tile_size; ++j) {
matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * matrixsize + j];
}
}
}

void assemble_tiles(data_type* matrix_split[], data_type* matrix, const std::size_t num_tiles,
const std::size_t tile_size, const std::size_t size, bool layRow) {
std::size_t i, j, i_tile, j_tile, tile, i_local, j_local;
#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) schedule(auto)
for (i = 0; i < size; ++i) {
i_local = std::size_t(i % tile_size);
i_tile = std::size_t(i / tile_size);
for (j = 0; j < size; ++j) {
j_tile = std::size_t(j / tile_size);
if (layRow) {
tile = i_tile * num_tiles + j_tile;
} else {
tile = j_tile * num_tiles + i_tile;
}
j_local = std::size_t(j % tile_size);
matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local];
}
}
}

bool verify_results(const data_type* lower_res, const data_type* dporft_res, const std::size_t totalsize) {
bool res = true;
data_type diff;
for (std::size_t i = 0; i < totalsize; ++i) {
diff = dporft_res[i] - lower_res[i];
if (fabs(dporft_res[i]) > 1e-5) {
diff /= dporft_res[i];
}
diff = fabs(diff);
if (diff > 1.0e-5) {
fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]);
res = false;
break;
}
}
return res;
}

void printLowerResults(const data_type* matrix, const std::size_t matrix_size) {
for (std::size_t row = 0; row < matrix_size; ++row) {
for (std::size_t col = 0; col <= row; ++col) {
fmt::print("{}\t", matrix[row * matrix_size + col]);
}
fmt::print("\n");
}
}

void print_mat_split(data_type* matrix_split[], const std::size_t num_tiles, const std::size_t tile_size) {
for (std::size_t itile = 0; itile < num_tiles * num_tiles; ++itile) {
fmt::print("Block {}:\n", itile);
for (std::size_t i = 0; i < tile_size; ++i) {
for (std::size_t j = 0; j < tile_size; ++j) {
fmt::print("{} ", matrix_split[itile][i * tile_size + j]);
}
fmt::print("\n");
}
fmt::print("\n");
}
}
Loading