Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tiled_cholesky w/o openmp #27

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions apps/choleskey/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ target_include_directories(
# choleskey_stdpar_snd
# PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
# ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

#tiled cholesky
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
add_executable(cholesky_tiled cholesky_tiled.cpp)
target_link_libraries(cholesky_tiled PRIVATE hpcpp-core blas lapack -fortranlibs gfortran)
target_include_directories(
cholesky_tiled
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS)
161 changes: 161 additions & 0 deletions apps/choleskey/cholesky_tiled.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
1. Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks.
references:
>Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53.
>Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.)

2. Therefore, the first step is to implement tiled Cholesky decomposition algorithm.

3. This file is to implement tiled Cholesky decomposition algorithm.
reference the implementation from Intel open source project, hetero-streams which will no longer be maintained by Intel.
https://github.com/intel/hetero-streams/tree/master/ref_code/cholesky

4. Additionally, include openblas library when build:
Copy link
Collaborator

@mhaseeb123 mhaseeb123 Dec 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to install/setup OpenBLAS with CPM during CMake? Personally I don't want to add these extra manual steps of installing and export OPENBLAS_DIR=.. as they would make general usage as well as CI/CD really tedious. @weilewei

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mhaseeb123 agreed not introducing extra flags. I just tried with CPMadd, and building openblas takes a while for the first-time building it.
@hcq9102 try the following in the CMakeLists files. Additionally, I request to add a compilation flag something like USE_OPENBLAS to cmake build to separate out if we should use cpm to add openblas.

in main CmakeLists.txt

if (USE_OPENBLAS)
cpmaddpackage(NAME openblas GITHUB_REPOSITORY OpenMathLib/OpenBLAS GIT_TAG
              v0.3.25)
endif()

in your application CMakeLists.txt:

target_link_libraries(myapp openblas)

or $ export OPENBLAS_DIR=/openblas/path
*/

#include <bits/stdc++.h>
#include <cblas.h>
#include <lapacke.h>
#include <omp.h>
#include <cmath>
#include <cstring>
#include "argparse/argparse.hpp"
#include "matrixutil.hpp"

void tiled_cholesky(data_type* matrix_split[], const int tile_size, const int num_tiles, CBLAS_ORDER blasLay,
const int lapackLay) {
unsigned int m = 0, n = 0, k = 0;

for (k = 0; k < num_tiles; ++k) {
//POTRF
int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size);

for (m = k + 1; m < num_tiles; ++m) {
//DTRSM
cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size);
}

for (n = k + 1; n < num_tiles; ++n) {
//DSYRK
cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, matrix_split[n * num_tiles + k],
tile_size, 1.0, matrix_split[n * num_tiles + n], tile_size);

for (m = n + 1; m < num_tiles; ++m) {
//DGEMM
cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size, 1.0,
matrix_split[m * num_tiles + n], tile_size);
}
}
}
}

int main(int argc, char** argv) {

// parse params
args_params_t args = argparse::parse<args_params_t>(argc, argv);
int tile_size = 0, tot_tiles = 0;

std::uint64_t matrix_size = args.mat_size; // Number of matrix dimension.
std::uint64_t num_tiles = args.num_tiles; // matrix size MUST be divisible
int verifycorrectness = args.verifycorrectness;

bool layRow = true;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved

fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles);

// Check mat_size is divisible by num_tiles
if (matrix_size % num_tiles != 0) {
fmt::print("matrix size must be divisible by num_tiles.. aborting\n");
throw std::invalid_argument("Matrix size is not divisible by num_tiles");
}

if (matrix_size == 0) {
fmt::print(" 0 illegal input matrix_size \n");
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
std::exit(0);
}

tile_size = matrix_size / num_tiles;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
tot_tiles = num_tiles * num_tiles;

data_type* A = new data_type[matrix_size * matrix_size];

// Allocate memory for tiled_cholesky for the full matrix
data_type* A_lower = new data_type[matrix_size * matrix_size];

// Allocate memory for MKL cholesky for the full matrix
data_type* A_MKL = new data_type[matrix_size * matrix_size];

// Memory allocation for tiled matrix
data_type** Asplit = new data_type*[tot_tiles];

for (int i = 0; i < tot_tiles; ++i) {
// Buffer per tile
Asplit[i] = new data_type[tile_size * tile_size];
}

//Generate a symmetric positve-definite matrix
A = generate_positiveDefinitionMatrix(matrix_size);

CBLAS_ORDER blasLay;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
int lapackLay;

if (layRow) {
blasLay = CblasRowMajor;
lapackLay = LAPACK_ROW_MAJOR;
} else {
blasLay = CblasColMajor;
lapackLay = LAPACK_COL_MAJOR;
}

//copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL)
std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(data_type));
std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(data_type));
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved

//splits the input matrix into tiles
split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow);

// Measure execution time.
Timer timer;
//run the tiled Cholesky function
tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay);

//assembling seperated tiles back into full matrix
assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow);
auto time = timer.stop();

if (args.time) {
fmt::print("Duration: {:f} ms\n", time);
}

//calling LAPACKE_dpotrf cholesky for verification and timing comparison
int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size);

if (verifycorrectness == 1) {
bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size);
if (res) {
fmt::print("Tiled Cholesky decomposition successful\n");
} else {
fmt::print("Tiled Cholesky decomposition failed\n");
}
fmt::print("\n");
}

// print lower_matrix if tiled_cholesky sucessfull
if (args.lower_matrix) {
fmt::print("The lower matrix of input matrix after tiled_cholesky: \n");
printLowerResults(A_lower, matrix_size);
}

//memory free
delete[] A;
delete[] A_lower;
delete[] A_MKL;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit, I wonder if we can do it thorough unique_ptr instead of new and delete. not important for this PR though.

for (int i = 0; i < tot_tiles; ++i) {
delete[] Asplit[i];
}

return 0;
}
176 changes: 149 additions & 27 deletions apps/choleskey/matrixutil.hpp
Original file line number Diff line number Diff line change
@@ -1,41 +1,163 @@
#pragma once

#include <iostream>
#include <vector>
#include <cblas.h>
#include <lapacke.h>
#include <stddef.h>
#include <stdlib.h>
#include <cstdlib>
#include "commons.hpp"

using data_type = double;

// generate positive definition matrix
template <typename T>
using Matrix = std::vector<std::vector<T>>;

template <typename T>
std::vector<T> generate_pascal_matrix(const int n) {
Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0)));

for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
if (i == 0 || j == 0) {
matrix[i][j] = static_cast<T>(1);
} else {
matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j];
}
}
}

std::vector<T> flattenedVector;
for (const auto& row : matrix) {
flattenedVector.insert(flattenedVector.end(), row.begin(), row.end());
}
return std::move(flattenedVector);
Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0)));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need static_cast here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch! for basic numeric types, we can skip using static_cast(0) for initialization to zero as it's redundant.
this is used in previous code, which I may discard it in the future when I clean up the code though.
I wont change it in this pr. But i mark it down for future clean up.


for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
if (i == 0 || j == 0) {
matrix[i][j] = static_cast<T>(1);
} else {
matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j];
}
}
}

std::vector<T> flattenedVector;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please change this std::vector<T> flattenedVector; to std::vector<T> flattenedVector(N); where N is your estimated biggest size of the flattened array size, so you can pre-allocate needed size.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in this way, then you may not use insert in the following, maybe need to use flattenedVector[i] = ...

Copy link
Contributor Author

@hcq9102 hcq9102 Dec 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay. As its not related with this PR, but useful for future clean up of previous code. I write it down. thanks.

for (const auto& row : matrix) {
flattenedVector.insert(flattenedVector.end(), row.begin(), row.end());
}
return std::move(flattenedVector);
}

// parameters define
struct args_params_t : public argparse::Args {
bool& results = kwarg("results", "print generated results (default: false)")
.set_default(true);
std::uint64_t& nd =
kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)")
.set_default(10);
std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
bool& help = flag("h, help", "print help");
bool& time = kwarg("t, time", "print time").set_default(true);
std::uint64_t& mat_size = kwarg("mat_size", "size of input matrix_size").set_default(10);
std::uint64_t& num_tiles = kwarg("num_tiles", "number of tiles").set_default(2);
int& verifycorrectness =
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
kwarg("verifycorrectness", "verify the tiled_cholesky results with LAPACKE_dpotrf cholesky").set_default(1);
bool& lower_matrix = kwarg("lower_matrix", "print generated results (default: false)").set_default(true);
bool& help = flag("h, help", "print help");
bool& time = kwarg("t, time", "print time").set_default(true);
};

// help functions for tiled_cholesky
data_type* generate_positiveDefinitionMatrix(const size_t matrix_size) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so the data type of matrix_size or length of input are inconsistent across your implementations. In some places, you use int, some unsigned int, and some size_t. please do a sweep across all your files, and make them using size_t or whatever that makes sense.

data_type* A_matrix = new data_type[matrix_size * matrix_size];
data_type* pd_matrix = (data_type*)malloc(matrix_size * matrix_size * sizeof(data_type));
unsigned int seeds = matrix_size;

// generate a random symmetric matrix
for (size_t row = 0; row < matrix_size; ++row) {
for (size_t col = row; col < matrix_size; ++col) {
A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] =
(data_type)rand_r(&seeds) / RAND_MAX;
}
}
// compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix,
matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size);

// Adjust Diagonals
for (size_t row = 0; row < matrix_size; ++row) {
double diagonals = 1.0; // from 1.0
for (size_t col = 0; col < matrix_size; ++col) {
diagonals += pd_matrix[row * matrix_size + col];
}
// Set the diag entry
pd_matrix[row * matrix_size + row] = diagonals;
}

delete[] A_matrix;
return pd_matrix;
}

void split_into_tiles(const data_type* matrix, data_type* matrix_split[], const int num_tiles, const int tile_size,
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
const int size, bool layRow) {
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved

int total_num_tiles = num_tiles * num_tiles;
int offset_tile;

//#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) {
if (layRow) {
offset_tile =
int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size;
} else {
offset_tile =
int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size;
}

for (int i = 0; i < tile_size; ++i)
//#pragma simd
for (int j = 0; j < tile_size; ++j) {
matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j];
}
}
}

void assemble_tiles(data_type* matrix_split[], data_type* matrix, const int num_tiles, const int tile_size,
const int size, bool layRow) {
int i_tile, j_tile, tile, i_local, j_local;
//#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
schedule(auto)
for (int i = 0; i < size; ++i) {
i_local = int(i % tile_size);
i_tile = int(i / tile_size);
//#pragma simd private(j_tile, tile, j_local)
for (int j = 0; j < size; ++j) {
j_tile = int(j / tile_size);
if (layRow) {
tile = i_tile * num_tiles + j_tile;
} else {
tile = j_tile * num_tiles + i_tile;
}
j_local = int(j % tile_size);
matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local];
}
}
}

bool verify_results(const data_type* lower_res, const data_type* dporft_res, const int totalsize) {
bool res = true;
data_type diff;
for (int i = 0; i < totalsize; ++i) {
diff = dporft_res[i] - lower_res[i];
if (fabs(dporft_res[i]) > 1e-5) {
diff /= dporft_res[i];
}
diff = fabs(diff);
if (diff > 1.0e-5) {
fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]);
res = false;
break;
}
}
return res;
}

void printLowerResults(const data_type* matrix, size_t matrix_size) {
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
for (size_t row = 0; row < matrix_size; ++row) {
for (size_t col = 0; col <= row; ++col) {
fmt::print("{}\t", matrix[row * matrix_size + col]);
}
fmt::print("\n");
}
}

void print_mat_split(data_type* matrix_split[], int num_tiles, int tile_size) {
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
for (int itile = 0; itile < num_tiles * num_tiles; ++itile) {
fmt::print("Block {}:\n", itile);
for (int i = 0; i < tile_size; ++i) {
for (int j = 0; j < tile_size; ++j) {
fmt::print("{} ", matrix_split[itile][i * tile_size + j]);
}
fmt::print("\n");
}
fmt::print("\n");
}
}