-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
326 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
/* | ||
Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks. | ||
references: | ||
>Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53. | ||
>Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.) | ||
Additionally, include openblas library when build: | ||
or $ export OPENBLAS_DIR=/openblas/path | ||
*/ | ||
#define CHOLESKY_OMP | ||
|
||
#include <bits/stdc++.h> | ||
#include <omp.h> | ||
#include <cmath> | ||
#include <cstring> | ||
#include "matrixutil.hpp" | ||
|
||
void tiled_cholesky(double* matrix_split[], const int tile_size, const int num_tiles, const CBLAS_ORDER blasLay, | ||
const int lapackLay) { | ||
unsigned int m = 0, n = 0, k = 0; | ||
|
||
for (k = 0; k < num_tiles; ++k) { //POTRF | ||
#pragma omp task depend(inout : matrix_split[k * num_tiles + k]) | ||
{ int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size); } | ||
|
||
for (m = k + 1; m < num_tiles; ++m) { //DTRSM | ||
#pragma omp task depend(in : matrix_split[k * num_tiles + k]) depend(inout : matrix_split[m * num_tiles + k]) | ||
{ | ||
cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0, | ||
matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size); | ||
} | ||
} | ||
|
||
for (n = k + 1; n < num_tiles; ++n) { //DSYRK | ||
#pragma omp task depend(in : matrix_split[n * num_tiles + k]) depend(inout : matrix_split[n * num_tiles + n]) | ||
{ | ||
cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, | ||
matrix_split[n * num_tiles + k], tile_size, 1.0, matrix_split[n * num_tiles + n], | ||
tile_size); | ||
} | ||
|
||
for (m = n + 1; m < num_tiles; ++m) { //DGEMM | ||
#pragma omp task depend(in : matrix_split[m * num_tiles + k], matrix_split[n * num_tiles + k]) \ | ||
depend(inout : matrix_split[m * num_tiles + n]) | ||
{ | ||
cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0, | ||
matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size, | ||
1.0, matrix_split[m * num_tiles + n], tile_size); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
int main(int argc, char** argv) { | ||
// parse params | ||
args_params_t args = argparse::parse<args_params_t>(argc, argv); | ||
int tile_size = 0, tot_tiles = 0; | ||
|
||
std::uint64_t matrix_size = args.mat_size; // Number of matrix dimension. | ||
std::uint64_t num_tiles = args.num_tiles; // matrix size MUST be divisible | ||
int verifycorrectness = args.verifycorrectness; | ||
int nthreads = args.nthreads; | ||
|
||
bool layRow = true; | ||
|
||
fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles); | ||
|
||
// Check mat_size is divisible by num_tiles | ||
if (matrix_size % num_tiles != 0) { | ||
fmt::print("matrix size must be divisible by num_tiles.. aborting\n"); | ||
throw std::invalid_argument("Matrix size is not divisible by num_tiles"); | ||
} | ||
|
||
if (matrix_size == 0) { | ||
fmt::print(" 0 illegal input matrix_size \n"); | ||
std::exit(0); | ||
} | ||
|
||
tile_size = matrix_size / num_tiles; | ||
tot_tiles = num_tiles * num_tiles; | ||
|
||
double* A = new double[matrix_size * matrix_size]; | ||
|
||
// Allocate memory for tiled_cholesky for the full matrix | ||
double* A_lower = new double[matrix_size * matrix_size]; | ||
|
||
// Allocate memory for MKL cholesky for the full matrix | ||
double* A_MKL = new double[matrix_size * matrix_size]; | ||
|
||
// Memory allocation for tiled matrix | ||
double** Asplit = new double*[tot_tiles]; | ||
|
||
for (int i = 0; i < tot_tiles; ++i) { | ||
// Buffer per tile | ||
Asplit[i] = new double[tile_size * tile_size]; | ||
} | ||
|
||
//Generate a symmetric positve-definite matrix | ||
A = generate_positiveDefinitionMatrix(matrix_size); | ||
|
||
//printMatrix(A, mat_size_m); | ||
|
||
CBLAS_ORDER blasLay; | ||
int lapackLay; | ||
|
||
if (layRow) { | ||
blasLay = CblasRowMajor; | ||
lapackLay = LAPACK_ROW_MAJOR; | ||
} else { | ||
blasLay = CblasColMajor; | ||
lapackLay = LAPACK_COL_MAJOR; | ||
} | ||
|
||
//copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL) | ||
std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(double)); | ||
std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(double)); | ||
|
||
//splits the input matrix into tiles | ||
split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow); | ||
|
||
// Measure execution time. | ||
Timer timer; | ||
//run the tiled Cholesky function | ||
#pragma omp parallel num_threads(nthreads) | ||
{ | ||
#pragma omp master | ||
{ tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay); } | ||
} | ||
|
||
//assembling seperated tiles back into full matrix | ||
assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow); | ||
auto time = timer.stop(); | ||
if (args.time) { | ||
fmt::print("Duration: {:f} ms\n", time); | ||
} | ||
|
||
//calling LAPACKE_dpotrf cholesky for verification and timing comparison | ||
int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size); | ||
|
||
if (verifycorrectness == 1) { | ||
bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size); | ||
if (res) { | ||
fmt::print("Tiled Cholesky decomposition successful\n"); | ||
} else { | ||
fmt::print("Tiled Cholesky decomposition failed\n"); | ||
} | ||
fmt::print("\n"); | ||
} | ||
|
||
// print lower_matrix if tiled_cholesky sucessfull | ||
if (args.lower_matrix) { | ||
fmt::print("The lower matrix of input matrix after tiled_cholesky: \n"); | ||
printLowerResults(A_lower, matrix_size); | ||
} | ||
|
||
//memory free | ||
delete[] A; | ||
delete[] A_lower; | ||
delete[] A_MKL; | ||
for (int i = 0; i < tot_tiles; ++i) { | ||
delete[] Asplit[i]; | ||
} | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,166 @@ | ||
#pragma once | ||
|
||
#include <iostream> | ||
#include <vector> | ||
#include <cblas.h> | ||
#include <lapacke.h> | ||
#include <stddef.h> | ||
#include <stdlib.h> | ||
#include <cstdlib> | ||
#include "argparse/argparse.hpp" | ||
#include "commons.hpp" | ||
|
||
// generate positive definition matrix | ||
template <typename T> | ||
using Matrix = std::vector<std::vector<T>>; | ||
|
||
template <typename T> | ||
std::vector<T> generate_pascal_matrix(const int n) { | ||
Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0))); | ||
|
||
for (int i = 0; i < n; ++i) { | ||
for (int j = 0; j < n; ++j) { | ||
if (i == 0 || j == 0) { | ||
matrix[i][j] = static_cast<T>(1); | ||
} else { | ||
matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; | ||
} | ||
} | ||
} | ||
|
||
std::vector<T> flattenedVector; | ||
for (const auto& row : matrix) { | ||
flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); | ||
} | ||
return std::move(flattenedVector); | ||
Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0))); | ||
|
||
for (int i = 0; i < n; ++i) { | ||
for (int j = 0; j < n; ++j) { | ||
if (i == 0 || j == 0) { | ||
matrix[i][j] = static_cast<T>(1); | ||
} else { | ||
matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; | ||
} | ||
} | ||
} | ||
|
||
std::vector<T> flattenedVector; | ||
for (const auto& row : matrix) { | ||
flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); | ||
} | ||
return std::move(flattenedVector); | ||
} | ||
|
||
// parameters define | ||
struct args_params_t : public argparse::Args { | ||
bool& results = kwarg("results", "print generated results (default: false)") | ||
.set_default(true); | ||
std::uint64_t& nd = | ||
kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)") | ||
.set_default(10); | ||
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); | ||
bool& help = flag("h, help", "print help"); | ||
bool& time = kwarg("t, time", "print time").set_default(true); | ||
std::uint64_t& mat_size = kwarg("mat_size", "size of input matrix_size").set_default(10); | ||
std::uint64_t& num_tiles = kwarg("num_tiles", "number of tiles").set_default(2); | ||
int& verifycorrectness = | ||
kwarg("verifycorrectness", "verify the tiled_cholesky results with LAPACKE_dpotrf cholesky").set_default(1); | ||
#if defined(CHOLESKY_OMP) | ||
int& nthreads = kwarg("nthreads", "number of threads").set_default(1); | ||
#endif // CHOLESKY_OMP | ||
bool& lower_matrix = kwarg("lower_matrix", "print generated results (default: false)").set_default(true); | ||
bool& help = flag("h, help", "print help"); | ||
bool& time = kwarg("t, time", "print time").set_default(true); | ||
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); | ||
}; | ||
|
||
// help functions for tiled_cholesky | ||
double* generate_positiveDefinitionMatrix(const size_t matrix_size) { | ||
double* A_matrix = new double[matrix_size * matrix_size]; | ||
double* pd_matrix = (double*)malloc(matrix_size * matrix_size * sizeof(double)); | ||
unsigned int seeds = matrix_size; | ||
|
||
// generate a random symmetric matrix | ||
for (size_t row = 0; row < matrix_size; ++row) { | ||
for (size_t col = row; col < matrix_size; ++col) { | ||
A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] = (double)rand_r(&seeds) / RAND_MAX; | ||
} | ||
} | ||
// compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix. | ||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix, | ||
matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size); | ||
|
||
// Adjust Diagonals | ||
for (size_t row = 0; row < matrix_size; ++row) { | ||
double diagonals = 1.0; // from 1.0 | ||
for (size_t col = 0; col < matrix_size; ++col) { | ||
diagonals += pd_matrix[row * matrix_size + col]; | ||
} | ||
// Set the diag entry | ||
pd_matrix[row * matrix_size + row] = diagonals; | ||
} | ||
|
||
delete[] A_matrix; | ||
return pd_matrix; | ||
} | ||
|
||
void split_into_tiles(const double* matrix, double* matrix_split[], const int num_tiles, const int tile_size, | ||
const int size, bool layRow) { | ||
|
||
int total_num_tiles = num_tiles * num_tiles; | ||
int offset_tile; | ||
|
||
//#pragma omp parallel for private(i, j, offset_tile) schedule(auto) | ||
for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) { | ||
if (layRow) { | ||
offset_tile = | ||
int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size; | ||
} else { | ||
offset_tile = | ||
int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size; | ||
} | ||
|
||
for (int i = 0; i < tile_size; ++i) | ||
//#pragma simd | ||
for (int j = 0; j < tile_size; ++j) { | ||
matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j]; | ||
} | ||
} | ||
} | ||
|
||
double* assemble_tiles(double* matrix_split[], double* matrix, const int num_tiles, const int tile_size, const int size, | ||
bool layRow) { | ||
int i_tile, j_tile, tile, i_local, j_local; | ||
//#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \ | ||
schedule(auto) | ||
for (int i = 0; i < size; ++i) { | ||
i_local = int(i % tile_size); | ||
i_tile = int(i / tile_size); | ||
//#pragma simd private(j_tile, tile, j_local) | ||
for (int j = 0; j < size; ++j) { | ||
j_tile = int(j / tile_size); | ||
if (layRow) { | ||
tile = i_tile * num_tiles + j_tile; | ||
} else { | ||
tile = j_tile * num_tiles + i_tile; | ||
} | ||
j_local = int(j % tile_size); | ||
matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local]; | ||
} | ||
} | ||
return matrix; | ||
} | ||
|
||
bool verify_results(const double* lower_res, const double* dporft_res, const int totalsize) { | ||
bool res = true; | ||
double diff; | ||
for (int i = 0; i < totalsize; ++i) { | ||
diff = dporft_res[i] - lower_res[i]; | ||
if (fabs(dporft_res[i]) > 1e-5) { | ||
diff /= dporft_res[i]; | ||
} | ||
diff = fabs(diff); | ||
if (diff > 1.0e-5) { | ||
fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]); | ||
res = false; | ||
break; | ||
} | ||
} | ||
return res; | ||
} | ||
|
||
void printLowerResults(const double* matrix, size_t matrix_size) { | ||
for (size_t row = 0; row < matrix_size; ++row) { | ||
for (size_t col = 0; col <= row; ++col) { | ||
fmt::print("{}\t", matrix[row * matrix_size + col]); | ||
} | ||
fmt::print("\n"); | ||
} | ||
} | ||
|
||
void print_mat_split(double* matrix_split[], int num_tiles, int tile_size) { | ||
for (int itile = 0; itile < num_tiles * num_tiles; ++itile) { | ||
fmt::print("Block {}:\n", itile); | ||
for (int i = 0; i < tile_size; ++i) { | ||
for (int j = 0; j < tile_size; ++j) { | ||
fmt::print("{} ", matrix_split[itile][i * tile_size + j]); | ||
} | ||
fmt::print("\n"); | ||
} | ||
fmt::print("\n"); | ||
} | ||
} |