Skip to content

Commit

Permalink
cholesky_tiled_openmp implement
Browse files Browse the repository at this point in the history
  • Loading branch information
hcq9102 committed Dec 5, 2023
1 parent 7737bd6 commit f18b4a7
Show file tree
Hide file tree
Showing 3 changed files with 326 additions and 27 deletions.
8 changes: 8 additions & 0 deletions apps/choleskey/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ target_include_directories(
# choleskey_stdpar_snd
# PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
# ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

#tiled cholesky
add_executable(cholesky_tiled cholesky_tiled.cpp)
target_link_libraries(cholesky_tiled PRIVATE hpcpp-core blas lapack -fortranlibs gfortran)
target_include_directories(
cholesky_tiled
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS)
166 changes: 166 additions & 0 deletions apps/choleskey/cholesky_tiled_omp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks.
references:
>Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53.
>Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.)
Additionally, include openblas library when build:
or $ export OPENBLAS_DIR=/openblas/path
*/
#define CHOLESKY_OMP

#include <bits/stdc++.h>
#include <omp.h>
#include <cmath>
#include <cstring>
#include "matrixutil.hpp"

void tiled_cholesky(double* matrix_split[], const int tile_size, const int num_tiles, const CBLAS_ORDER blasLay,
const int lapackLay) {
unsigned int m = 0, n = 0, k = 0;

for (k = 0; k < num_tiles; ++k) { //POTRF
#pragma omp task depend(inout : matrix_split[k * num_tiles + k])
{ int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size); }

for (m = k + 1; m < num_tiles; ++m) { //DTRSM
#pragma omp task depend(in : matrix_split[k * num_tiles + k]) depend(inout : matrix_split[m * num_tiles + k])
{
cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size);
}
}

for (n = k + 1; n < num_tiles; ++n) { //DSYRK
#pragma omp task depend(in : matrix_split[n * num_tiles + k]) depend(inout : matrix_split[n * num_tiles + n])
{
cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0,
matrix_split[n * num_tiles + k], tile_size, 1.0, matrix_split[n * num_tiles + n],
tile_size);
}

for (m = n + 1; m < num_tiles; ++m) { //DGEMM
#pragma omp task depend(in : matrix_split[m * num_tiles + k], matrix_split[n * num_tiles + k]) \
depend(inout : matrix_split[m * num_tiles + n])
{
cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size,
1.0, matrix_split[m * num_tiles + n], tile_size);
}
}
}
}
}

int main(int argc, char** argv) {
// parse params
args_params_t args = argparse::parse<args_params_t>(argc, argv);
int tile_size = 0, tot_tiles = 0;

std::uint64_t matrix_size = args.mat_size; // Number of matrix dimension.
std::uint64_t num_tiles = args.num_tiles; // matrix size MUST be divisible
int verifycorrectness = args.verifycorrectness;
int nthreads = args.nthreads;

bool layRow = true;

fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles);

// Check mat_size is divisible by num_tiles
if (matrix_size % num_tiles != 0) {
fmt::print("matrix size must be divisible by num_tiles.. aborting\n");
throw std::invalid_argument("Matrix size is not divisible by num_tiles");
}

if (matrix_size == 0) {
fmt::print(" 0 illegal input matrix_size \n");
std::exit(0);
}

tile_size = matrix_size / num_tiles;
tot_tiles = num_tiles * num_tiles;

double* A = new double[matrix_size * matrix_size];

// Allocate memory for tiled_cholesky for the full matrix
double* A_lower = new double[matrix_size * matrix_size];

// Allocate memory for MKL cholesky for the full matrix
double* A_MKL = new double[matrix_size * matrix_size];

// Memory allocation for tiled matrix
double** Asplit = new double*[tot_tiles];

for (int i = 0; i < tot_tiles; ++i) {
// Buffer per tile
Asplit[i] = new double[tile_size * tile_size];
}

//Generate a symmetric positve-definite matrix
A = generate_positiveDefinitionMatrix(matrix_size);

//printMatrix(A, mat_size_m);

CBLAS_ORDER blasLay;
int lapackLay;

if (layRow) {
blasLay = CblasRowMajor;
lapackLay = LAPACK_ROW_MAJOR;
} else {
blasLay = CblasColMajor;
lapackLay = LAPACK_COL_MAJOR;
}

//copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL)
std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(double));
std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(double));

//splits the input matrix into tiles
split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow);

// Measure execution time.
Timer timer;
//run the tiled Cholesky function
#pragma omp parallel num_threads(nthreads)
{
#pragma omp master
{ tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay); }
}

//assembling seperated tiles back into full matrix
assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow);
auto time = timer.stop();
if (args.time) {
fmt::print("Duration: {:f} ms\n", time);
}

//calling LAPACKE_dpotrf cholesky for verification and timing comparison
int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size);

if (verifycorrectness == 1) {
bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size);
if (res) {
fmt::print("Tiled Cholesky decomposition successful\n");
} else {
fmt::print("Tiled Cholesky decomposition failed\n");
}
fmt::print("\n");
}

// print lower_matrix if tiled_cholesky sucessfull
if (args.lower_matrix) {
fmt::print("The lower matrix of input matrix after tiled_cholesky: \n");
printLowerResults(A_lower, matrix_size);
}

//memory free
delete[] A;
delete[] A_lower;
delete[] A_MKL;
for (int i = 0; i < tot_tiles; ++i) {
delete[] Asplit[i];
}

return 0;
}
179 changes: 152 additions & 27 deletions apps/choleskey/matrixutil.hpp
Original file line number Diff line number Diff line change
@@ -1,41 +1,166 @@
#pragma once

#include <iostream>
#include <vector>
#include <cblas.h>
#include <lapacke.h>
#include <stddef.h>
#include <stdlib.h>
#include <cstdlib>
#include "argparse/argparse.hpp"
#include "commons.hpp"

// generate positive definition matrix
template <typename T>
using Matrix = std::vector<std::vector<T>>;

template <typename T>
std::vector<T> generate_pascal_matrix(const int n) {
Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0)));

for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
if (i == 0 || j == 0) {
matrix[i][j] = static_cast<T>(1);
} else {
matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j];
}
}
}

std::vector<T> flattenedVector;
for (const auto& row : matrix) {
flattenedVector.insert(flattenedVector.end(), row.begin(), row.end());
}
return std::move(flattenedVector);
Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0)));

for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
if (i == 0 || j == 0) {
matrix[i][j] = static_cast<T>(1);
} else {
matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j];
}
}
}

std::vector<T> flattenedVector;
for (const auto& row : matrix) {
flattenedVector.insert(flattenedVector.end(), row.begin(), row.end());
}
return std::move(flattenedVector);
}

// parameters define
struct args_params_t : public argparse::Args {
bool& results = kwarg("results", "print generated results (default: false)")
.set_default(true);
std::uint64_t& nd =
kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)")
.set_default(10);
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
bool& help = flag("h, help", "print help");
bool& time = kwarg("t, time", "print time").set_default(true);
std::uint64_t& mat_size = kwarg("mat_size", "size of input matrix_size").set_default(10);
std::uint64_t& num_tiles = kwarg("num_tiles", "number of tiles").set_default(2);
int& verifycorrectness =
kwarg("verifycorrectness", "verify the tiled_cholesky results with LAPACKE_dpotrf cholesky").set_default(1);
#if defined(CHOLESKY_OMP)
int& nthreads = kwarg("nthreads", "number of threads").set_default(1);
#endif // CHOLESKY_OMP
bool& lower_matrix = kwarg("lower_matrix", "print generated results (default: false)").set_default(true);
bool& help = flag("h, help", "print help");
bool& time = kwarg("t, time", "print time").set_default(true);
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
};

// help functions for tiled_cholesky
double* generate_positiveDefinitionMatrix(const size_t matrix_size) {
double* A_matrix = new double[matrix_size * matrix_size];
double* pd_matrix = (double*)malloc(matrix_size * matrix_size * sizeof(double));
unsigned int seeds = matrix_size;

// generate a random symmetric matrix
for (size_t row = 0; row < matrix_size; ++row) {
for (size_t col = row; col < matrix_size; ++col) {
A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] = (double)rand_r(&seeds) / RAND_MAX;
}
}
// compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix,
matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size);

// Adjust Diagonals
for (size_t row = 0; row < matrix_size; ++row) {
double diagonals = 1.0; // from 1.0
for (size_t col = 0; col < matrix_size; ++col) {
diagonals += pd_matrix[row * matrix_size + col];
}
// Set the diag entry
pd_matrix[row * matrix_size + row] = diagonals;
}

delete[] A_matrix;
return pd_matrix;
}

void split_into_tiles(const double* matrix, double* matrix_split[], const int num_tiles, const int tile_size,
const int size, bool layRow) {

int total_num_tiles = num_tiles * num_tiles;
int offset_tile;

//#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) {
if (layRow) {
offset_tile =
int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size;
} else {
offset_tile =
int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size;
}

for (int i = 0; i < tile_size; ++i)
//#pragma simd
for (int j = 0; j < tile_size; ++j) {
matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j];
}
}
}

double* assemble_tiles(double* matrix_split[], double* matrix, const int num_tiles, const int tile_size, const int size,
bool layRow) {
int i_tile, j_tile, tile, i_local, j_local;
//#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \
schedule(auto)
for (int i = 0; i < size; ++i) {
i_local = int(i % tile_size);
i_tile = int(i / tile_size);
//#pragma simd private(j_tile, tile, j_local)
for (int j = 0; j < size; ++j) {
j_tile = int(j / tile_size);
if (layRow) {
tile = i_tile * num_tiles + j_tile;
} else {
tile = j_tile * num_tiles + i_tile;
}
j_local = int(j % tile_size);
matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local];
}
}
return matrix;
}

bool verify_results(const double* lower_res, const double* dporft_res, const int totalsize) {
bool res = true;
double diff;
for (int i = 0; i < totalsize; ++i) {
diff = dporft_res[i] - lower_res[i];
if (fabs(dporft_res[i]) > 1e-5) {
diff /= dporft_res[i];
}
diff = fabs(diff);
if (diff > 1.0e-5) {
fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]);
res = false;
break;
}
}
return res;
}

void printLowerResults(const double* matrix, size_t matrix_size) {
for (size_t row = 0; row < matrix_size; ++row) {
for (size_t col = 0; col <= row; ++col) {
fmt::print("{}\t", matrix[row * matrix_size + col]);
}
fmt::print("\n");
}
}

void print_mat_split(double* matrix_split[], int num_tiles, int tile_size) {
for (int itile = 0; itile < num_tiles * num_tiles; ++itile) {
fmt::print("Block {}:\n", itile);
for (int i = 0; i < tile_size; ++i) {
for (int j = 0; j < tile_size; ++j) {
fmt::print("{} ", matrix_split[itile][i * tile_size + j]);
}
fmt::print("\n");
}
fmt::print("\n");
}
}

0 comments on commit f18b4a7

Please sign in to comment.