Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tiled_cholesky w/o openmp #27

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions apps/choleskey/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,13 @@ target_include_directories(
# choleskey_stdpar_snd
# PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
# ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

#tiled cholesky
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
include_directories(/global/homes/c/chuanqiu/newrepo/locallib/openblasinstall/include/openblas)
link_directories(/global/homes/c/chuanqiu/newrepo/locallib/openblasinstall/lib64/cmake/OpenBLAS)
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
add_executable(cholesky_tiled cholesky_tiled.cpp)
target_link_libraries(cholesky_tiled PRIVATE blas lapack -fortranlibs gfortran)
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
target_include_directories(
cholesky_tiled
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
192 changes: 192 additions & 0 deletions apps/choleskey/cholesky_tiled.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#include <bits/stdc++.h>
#include <cblas.h>
#include <lapacke.h>
#include <omp.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <experimental/linalg>
#include <iostream>
#include <utility>
#include <vector>
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
#include "help.hpp"

using data_type = double;

hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
//#include "time.hpp"
#define SWITCH_CHAR '-'
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved

double* dpo_generate(size_t side_size) {
unsigned int seed = side_size;
#ifdef _WIN32
srand(seed);
#endif
// M is a (very) pseudo-random symmetric matrix
double* M_matrix = new double[side_size * side_size];
for (size_t row = 0; row < side_size; ++row) {
for (size_t col = row; col < side_size; ++col) {
M_matrix[col * side_size + row] = M_matrix[row * side_size + col] = (double)
#ifdef _WIN32
rand()
#else
rand_r(&seed)
#endif
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
/ RAND_MAX;
}
}
double* ret_matrix = (double*)malloc(side_size * side_size * sizeof(double));
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, side_size, side_size, side_size, 1.0, M_matrix, side_size,
M_matrix, side_size, 0.0, ret_matrix, side_size);

//adjust diagonals (diag = sum (row entries) + 1.0)
for (size_t row = 0; row < side_size; ++row) {
double diag = 1.0; //start from 1.0
for (size_t col = 0; col < side_size; ++col) {
diag += ret_matrix[row * side_size + col];
}
//set the diag entry
ret_matrix[row * side_size + row] = diag;
}

delete[] M_matrix;
return ret_matrix;
}

void tiled_cholesky(double* mat_sp[], int tile_size, int num_tiles, CBLAS_ORDER blasLay, int lapackLay) {
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
unsigned int m, n, k;
int info;

for (k = 0; k < num_tiles; ++k) {

//POTRF - MKL call
info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, mat_sp[k * num_tiles + k], tile_size);
std::cout << "info = " << info << std::endl;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved

for (m = k + 1; m < num_tiles; ++m) {

//DTRSM - MKL call
cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
mat_sp[k * num_tiles + k], tile_size, mat_sp[m * num_tiles + k], tile_size);
}

for (n = k + 1; n < num_tiles; ++n) {
//DSYRK - MKL call
cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, mat_sp[n * num_tiles + k],
tile_size, 1.0, mat_sp[n * num_tiles + n], tile_size);

for (m = n + 1; m < num_tiles; ++m) {

//DGEMM - MKL call
cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
mat_sp[m * num_tiles + k], tile_size, mat_sp[n * num_tiles + k], tile_size, 1.0,
mat_sp[m * num_tiles + n], tile_size);
}
}
}
}

int main(int argc, char** argv) {
int info;

int mat_size_m, num_tiles, tile_size, tot_tiles;
mat_size_m = 4; // must be an input
num_tiles = 2; // matrix size MUST be divisible

bool layRow = true;
int verify = 1;

std::cout << "mat_size = " << mat_size_m << ", num_tiles = " << num_tiles << std::endl << std::endl;

// Check that mat_size is divisible by num_tiles
if (mat_size_m % num_tiles != 0) {
std::cout << "matrix size MUST be divisible by num_tiles.. aborting" << std::endl;
throw std::invalid_argument("Matrix size is not divisible by num_tiles");
}

if (mat_size_m == 0) {
printf("mat_size_m is not defined\n");
exit(0);
}

tile_size = mat_size_m / num_tiles;
tot_tiles = num_tiles * num_tiles;

//allocating memory for input matrix (full matrix)
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
double* A = (double*)malloc(mat_size_m * mat_size_m * sizeof(double));

//allocating memory for tiled_cholesky for the full matrix
double* A_lower = (double*)malloc(mat_size_m * mat_size_m * sizeof(double));

//allocating memory for MKL cholesky for the full matrix
double* A_MKL = (double*)malloc(mat_size_m * mat_size_m * sizeof(double));

//memory allocation for tiled matrix
double** Asplit = new double*[tot_tiles];

for (int i = 0; i < tot_tiles; ++i) {
//Buffer per tile
Asplit[i] = (double*)malloc(tile_size * tile_size * sizeof(double));
}

//Generate a symmetric positve-definite matrix
A = dpo_generate(mat_size_m);

printMatrix(A, mat_size_m);

CBLAS_ORDER blasLay;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
int lapackLay;

if (layRow) {
blasLay = CblasRowMajor;
lapackLay = LAPACK_ROW_MAJOR;
} else {
blasLay = CblasColMajor;
lapackLay = LAPACK_COL_MAJOR;
}

//copying matrices into separate variables for tiled cholesky (A_my)
//and MKL cholesky (A_MKL)
//The output overwrites the matrices
std::cout << "A_my = " << std::endl;
copy_mat(A, A_lower, mat_size_m);
copy_mat(A, A_MKL, mat_size_m);
printMatrix(A_lower, mat_size_m);

//This splits the input matrix into tiles (or blocks)
split_into_blocks(A_lower, Asplit, num_tiles, tile_size, mat_size_m, layRow);
std::cout << "split matrix: " << std::endl;
print_mat_split(Asplit, num_tiles, tile_size);

//Calling the tiled Cholesky function. This does the factorization of the full matrix using a tiled implementation.
tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay);

//assembling of tiles back into full matrix
assemble(Asplit, A_lower, num_tiles, tile_size, mat_size_m, layRow);
std::cout << "Cholosky decomposition: lower matrix_A_lower \n";
printMatrix(A_lower, mat_size_m);
//tbegin = dtimeGet();

//calling mkl cholesky for verification and timing comparison
info = LAPACKE_dpotrf(lapackLay, 'L', mat_size_m, A_MKL, mat_size_m);
std::cout << "A_MKL \n";
printMatrix(A_MKL, mat_size_m);

if (verify == 1) {
bool res = verify_results(A_lower, A_MKL, mat_size_m * mat_size_m);
if (res == true) {
printf("Tiled Cholesky successful\n");
} else {
printf("Tiled Chloesky failed\n");
}
}

//free
free(A);
free(A_lower);
free(A_MKL);
for (int i = 0; i < tot_tiles; ++i) {
free(Asplit[i]);
}

return 0;
}
138 changes: 138 additions & 0 deletions apps/choleskey/help.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#pragma once

hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
#include <cblas.h>
#include <lapacke.h>
#include <stddef.h>
#include <stdlib.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>

// Function to read data from a text file and store it in a vector
template <typename T>
inline std::vector<T> readDataFromFile(const std::string& filename) {
std::vector<T> data;

// Open the file
std::ifstream file(filename);

// Check if the file is open successfully
if (!file.is_open()) {
std::cerr << "Failed to open the file: " << filename << std::endl;
return data; // Return an empty vector in case of failure
}

std::string line;
while (std::getline(file, line)) {
// Use std::istringstream to parse each line into doubles and store them in
// the vector
double value;
std::istringstream iss(line);
while (iss >> value) {
data.push_back(value);
}
}

// Close the file
file.close();

return data;
}

void split_into_blocks(double* mat, double* mat_split[], int num_tiles,
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
int tile_size, int size, bool layRow) {
int itile, i, j, offset_tile;

int tot_tiles = num_tiles * num_tiles;

//#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
for (itile = 0; itile < tot_tiles; ++itile) {
if (layRow) {
offset_tile = int(itile / num_tiles) * num_tiles * tile_size * tile_size +
int(itile % num_tiles) * tile_size;
} else {
offset_tile = int(itile % num_tiles) * num_tiles * tile_size * tile_size +
int(itile / num_tiles) * tile_size;
}

for (i = 0; i < tile_size; ++i)
//#pragma simd
for (j = 0; j < tile_size; ++j) {
mat_split[itile][i * tile_size + j] = mat[offset_tile + i * size + j];
}
}
}

void assemble(double* mat_split[], double* mat, int num_tiles, int tile_size,
int size, bool layRow) {
int i_tile, j_tile, tile, i, j, i_local, j_local;
//#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \
schedule(auto)
for (i = 0; i < size; ++i) {
i_local = int(i % tile_size);
i_tile = int(i / tile_size);
//#pragma simd private(j_tile, tile, j_local)
for (j = 0; j < size; ++j) {
j_tile = int(j / tile_size);
if (layRow) {
tile = i_tile * num_tiles + j_tile;
} else {
tile = j_tile * num_tiles + i_tile;
}
j_local = int(j % tile_size);
mat[i * size + j] = mat_split[tile][i_local * tile_size + j_local];
}
}
}

void copy_mat(double* A, double* B, int size) {
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
int i, j;
for (i = 0; i < size; ++i)
for (j = 0; j < size; ++j) {
B[i * size + j] = A[i * size + j];
}
}

bool verify_results(double* act, double* ref, int totalsize) {
double diff;

bool res = true;
for (int i = 0; i < totalsize; ++i) {
diff = ref[i] - act[i];
if (fabs(ref[i]) > 1e-5) {
diff /= ref[i];
}
diff = fabs(diff);
if (diff > 1.0e-5) {
printf("\nError detected at i = %d: ref %g actual %g\n", i, ref[i],
act[i]);
res = false;
break;
}
}
return res;
}

void printMatrix(const double* matrix, size_t side_size) {
for (size_t row = 0; row < side_size; ++row) {
for (size_t col = 0; col <= row; ++col) {
std::cout << matrix[row * side_size + col] << "\t";
}
std::cout << std::endl;
}
}

void print_mat_split(double* mat_split[], int num_tiles, int tile_size) {
for (int itile = 0; itile < num_tiles * num_tiles; ++itile) {
printf("Block %d:\n", itile);
for (int i = 0; i < tile_size; ++i) {
for (int j = 0; j < tile_size; ++j) {
printf("%f ", mat_split[itile][i * tile_size + j]);
}
printf("\n");
}
printf("\n");
}
}
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved