From f18b4a7ce1ec54374ad4880e718e7102887f5381 Mon Sep 17 00:00:00 2001
From: hcq9102 <chuanqiuhe@gmail.com>
Date: Mon, 4 Dec 2023 14:45:29 -0800
Subject: [PATCH] cholesky_tiled_openmp implement

---
 apps/choleskey/CMakeLists.txt         |   8 ++
 apps/choleskey/cholesky_tiled_omp.cpp | 166 ++++++++++++++++++++++++
 apps/choleskey/matrixutil.hpp         | 179 ++++++++++++++++++++++----
 3 files changed, 326 insertions(+), 27 deletions(-)
 create mode 100644 apps/choleskey/cholesky_tiled_omp.cpp
diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt
index fb87553..a78fe9d 100644
--- a/apps/choleskey/CMakeLists.txt
+++ b/apps/choleskey/CMakeLists.txt
@@ -21,3 +21,11 @@ target_include_directories(
 #   choleskey_stdpar_snd
 #   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
 #           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
+
+#tiled cholesky
+add_executable(cholesky_tiled cholesky_tiled.cpp)
+target_link_libraries(cholesky_tiled PRIVATE hpcpp-core blas lapack -fortranlibs gfortran)
+target_include_directories(
+  cholesky_tiled
+  PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
+          ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS)
diff --git a/apps/choleskey/cholesky_tiled_omp.cpp b/apps/choleskey/cholesky_tiled_omp.cpp
new file mode 100644
index 0000000..540edaa
--- /dev/null
+++ b/apps/choleskey/cholesky_tiled_omp.cpp
@@ -0,0 +1,166 @@
+/*
+Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks.
+    references:
+    >Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53.
+    >Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.)
+
+Additionally, include openblas library when build:
+   or $ export OPENBLAS_DIR=/openblas/path
+*/
+#define CHOLESKY_OMP
+
+#include <bits/stdc++.h>
+#include <omp.h>
+#include <cmath>
+#include <cstring>
+#include "matrixutil.hpp"
+
+void tiled_cholesky(double* matrix_split[], const int tile_size, const int num_tiles, const CBLAS_ORDER blasLay,
+                    const int lapackLay) {
+    unsigned int m = 0, n = 0, k = 0;
+
+    for (k = 0; k < num_tiles; ++k) {  //POTRF
+#pragma omp task depend(inout : matrix_split[k * num_tiles + k])
+        { int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size); }
+
+        for (m = k + 1; m < num_tiles; ++m) {  //DTRSM
+#pragma omp task depend(in : matrix_split[k * num_tiles + k]) depend(inout : matrix_split[m * num_tiles + k])
+            {
+                cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
+                            matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size);
+            }
+        }
+
+        for (n = k + 1; n < num_tiles; ++n) {  //DSYRK
+#pragma omp task depend(in : matrix_split[n * num_tiles + k]) depend(inout : matrix_split[n * num_tiles + n])
+            {
+                cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0,
+                            matrix_split[n * num_tiles + k], tile_size, 1.0, matrix_split[n * num_tiles + n],
+                            tile_size);
+            }
+
+            for (m = n + 1; m < num_tiles; ++m) {  //DGEMM
+#pragma omp task depend(in : matrix_split[m * num_tiles + k], matrix_split[n * num_tiles + k]) \
+    depend(inout : matrix_split[m * num_tiles + n])
+                {
+                    cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
+                                matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size,
+                                1.0, matrix_split[m * num_tiles + n], tile_size);
+                }
+            }
+        }
+    }
+}
+
+int main(int argc, char** argv) {
+    // parse params
+    args_params_t args = argparse::parse<args_params_t>(argc, argv);
+    int tile_size = 0, tot_tiles = 0;
+
+    std::uint64_t matrix_size = args.mat_size;  // Number of matrix dimension.
+    std::uint64_t num_tiles = args.num_tiles;   // matrix size MUST be divisible
+    int verifycorrectness = args.verifycorrectness;
+    int nthreads = args.nthreads;
+
+    bool layRow = true;
+
+    fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles);
+
+    // Check mat_size is divisible by num_tiles
+    if (matrix_size % num_tiles != 0) {
+        fmt::print("matrix size must be divisible by num_tiles.. aborting\n");
+        throw std::invalid_argument("Matrix size is not divisible by num_tiles");
+    }
+
+    if (matrix_size == 0) {
+        fmt::print(" 0 illegal input matrix_size \n");
+        std::exit(0);
+    }
+
+    tile_size = matrix_size / num_tiles;
+    tot_tiles = num_tiles * num_tiles;
+
+    double* A = new double[matrix_size * matrix_size];
+
+    // Allocate memory for tiled_cholesky for the full matrix
+    double* A_lower = new double[matrix_size * matrix_size];
+
+    // Allocate memory for MKL cholesky for the full matrix
+    double* A_MKL = new double[matrix_size * matrix_size];
+
+    // Memory allocation for tiled matrix
+    double** Asplit = new double*[tot_tiles];
+
+    for (int i = 0; i < tot_tiles; ++i) {
+        // Buffer per tile
+        Asplit[i] = new double[tile_size * tile_size];
+    }
+
+    //Generate a symmetric positve-definite matrix
+    A = generate_positiveDefinitionMatrix(matrix_size);
+
+    //printMatrix(A, mat_size_m);
+
+    CBLAS_ORDER blasLay;
+    int lapackLay;
+
+    if (layRow) {
+        blasLay = CblasRowMajor;
+        lapackLay = LAPACK_ROW_MAJOR;
+    } else {
+        blasLay = CblasColMajor;
+        lapackLay = LAPACK_COL_MAJOR;
+    }
+
+    //copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL)
+    std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(double));
+    std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(double));
+
+    //splits the input matrix into tiles
+    split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow);
+
+    // Measure execution time.
+    Timer timer;
+    //run the tiled Cholesky function
+#pragma omp parallel num_threads(nthreads)
+    {
+#pragma omp master
+        { tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay); }
+    }
+
+    //assembling seperated tiles back into full matrix
+    assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow);
+    auto time = timer.stop();
+    if (args.time) {
+        fmt::print("Duration: {:f} ms\n", time);
+    }
+
+    //calling LAPACKE_dpotrf cholesky for verification and timing comparison
+    int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size);
+
+    if (verifycorrectness == 1) {
+        bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size);
+        if (res) {
+            fmt::print("Tiled Cholesky decomposition successful\n");
+        } else {
+            fmt::print("Tiled Cholesky decomposition failed\n");
+        }
+        fmt::print("\n");
+    }
+
+    // print lower_matrix if tiled_cholesky sucessfull
+    if (args.lower_matrix) {
+        fmt::print("The lower matrix of input matrix after tiled_cholesky: \n");
+        printLowerResults(A_lower, matrix_size);
+    }
+
+    //memory free
+    delete[] A;
+    delete[] A_lower;
+    delete[] A_MKL;
+    for (int i = 0; i < tot_tiles; ++i) {
+        delete[] Asplit[i];
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/apps/choleskey/matrixutil.hpp b/apps/choleskey/matrixutil.hpp
index 44f0468..5eada4d 100644
--- a/apps/choleskey/matrixutil.hpp
+++ b/apps/choleskey/matrixutil.hpp
@@ -1,7 +1,12 @@
 #pragma once
 
-#include <iostream>
-#include <vector>
+#include <cblas.h>
+#include <lapacke.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <cstdlib>
+#include "argparse/argparse.hpp"
+#include "commons.hpp"
 
 // generate positive definition matrix
 template <typename T>
@@ -9,33 +14,153 @@ using Matrix = std::vector<std::vector<T>>;
 
 template <typename T>
 std::vector<T> generate_pascal_matrix(const int n) {
-  Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0)));
-
-  for (int i = 0; i < n; ++i) {
-    for (int j = 0; j < n; ++j) {
-      if (i == 0 || j == 0) {
-        matrix[i][j] = static_cast<T>(1);
-      } else {
-        matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j];
-      }
-    }
-  }
-
-  std::vector<T> flattenedVector;
-  for (const auto& row : matrix) {
-    flattenedVector.insert(flattenedVector.end(), row.begin(), row.end());
-  }
-  return std::move(flattenedVector);
+    Matrix<T> matrix(n, std::vector<T>(n, static_cast<T>(0)));
+
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < n; ++j) {
+            if (i == 0 || j == 0) {
+                matrix[i][j] = static_cast<T>(1);
+            } else {
+                matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j];
+            }
+        }
+    }
+
+    std::vector<T> flattenedVector;
+    for (const auto& row : matrix) {
+        flattenedVector.insert(flattenedVector.end(), row.begin(), row.end());
+    }
+    return std::move(flattenedVector);
 }
 
 // parameters define
 struct args_params_t : public argparse::Args {
-  bool& results = kwarg("results", "print generated results (default: false)")
-                      .set_default(true);
-  std::uint64_t& nd =
-      kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)")
-          .set_default(10);
-  std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
-  bool& help = flag("h, help", "print help");
-  bool& time = kwarg("t, time", "print time").set_default(true);
+    std::uint64_t& mat_size = kwarg("mat_size", "size of input matrix_size").set_default(10);
+    std::uint64_t& num_tiles = kwarg("num_tiles", "number of tiles").set_default(2);
+    int& verifycorrectness =
+        kwarg("verifycorrectness", "verify the tiled_cholesky results with LAPACKE_dpotrf cholesky").set_default(1);
+#if defined(CHOLESKY_OMP)
+    int& nthreads = kwarg("nthreads", "number of threads").set_default(1);
+#endif  // CHOLESKY_OMP
+    bool& lower_matrix = kwarg("lower_matrix", "print generated results (default: false)").set_default(true);
+    bool& help = flag("h, help", "print help");
+    bool& time = kwarg("t, time", "print time").set_default(true);
+    std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
 };
+
+// help functions for tiled_cholesky
+double* generate_positiveDefinitionMatrix(const size_t matrix_size) {
+    double* A_matrix = new double[matrix_size * matrix_size];
+    double* pd_matrix = (double*)malloc(matrix_size * matrix_size * sizeof(double));
+    unsigned int seeds = matrix_size;
+
+    // generate a random symmetric matrix
+    for (size_t row = 0; row < matrix_size; ++row) {
+        for (size_t col = row; col < matrix_size; ++col) {
+            A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] = (double)rand_r(&seeds) / RAND_MAX;
+        }
+    }
+    // compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix.
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix,
+                matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size);
+
+    // Adjust Diagonals
+    for (size_t row = 0; row < matrix_size; ++row) {
+        double diagonals = 1.0;  // from 1.0
+        for (size_t col = 0; col < matrix_size; ++col) {
+            diagonals += pd_matrix[row * matrix_size + col];
+        }
+        // Set the diag entry
+        pd_matrix[row * matrix_size + row] = diagonals;
+    }
+
+    delete[] A_matrix;
+    return pd_matrix;
+}
+
+void split_into_tiles(const double* matrix, double* matrix_split[], const int num_tiles, const int tile_size,
+                      const int size, bool layRow) {
+
+    int total_num_tiles = num_tiles * num_tiles;
+    int offset_tile;
+
+    //#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
+    for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) {
+        if (layRow) {
+            offset_tile =
+                int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size;
+        } else {
+            offset_tile =
+                int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size;
+        }
+
+        for (int i = 0; i < tile_size; ++i)
+            //#pragma simd
+            for (int j = 0; j < tile_size; ++j) {
+                matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j];
+            }
+    }
+}
+
+double* assemble_tiles(double* matrix_split[], double* matrix, const int num_tiles, const int tile_size, const int size,
+                       bool layRow) {
+    int i_tile, j_tile, tile, i_local, j_local;
+    //#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \
+    schedule(auto)
+    for (int i = 0; i < size; ++i) {
+        i_local = int(i % tile_size);
+        i_tile = int(i / tile_size);
+        //#pragma simd private(j_tile, tile, j_local)
+        for (int j = 0; j < size; ++j) {
+            j_tile = int(j / tile_size);
+            if (layRow) {
+                tile = i_tile * num_tiles + j_tile;
+            } else {
+                tile = j_tile * num_tiles + i_tile;
+            }
+            j_local = int(j % tile_size);
+            matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local];
+        }
+    }
+    return matrix;
+}
+
+bool verify_results(const double* lower_res, const double* dporft_res, const int totalsize) {
+    bool res = true;
+    double diff;
+    for (int i = 0; i < totalsize; ++i) {
+        diff = dporft_res[i] - lower_res[i];
+        if (fabs(dporft_res[i]) > 1e-5) {
+            diff /= dporft_res[i];
+        }
+        diff = fabs(diff);
+        if (diff > 1.0e-5) {
+            fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]);
+            res = false;
+            break;
+        }
+    }
+    return res;
+}
+
+void printLowerResults(const double* matrix, size_t matrix_size) {
+    for (size_t row = 0; row < matrix_size; ++row) {
+        for (size_t col = 0; col <= row; ++col) {
+            fmt::print("{}\t", matrix[row * matrix_size + col]);
+        }
+        fmt::print("\n");
+    }
+}
+
+void print_mat_split(double* matrix_split[], int num_tiles, int tile_size) {
+    for (int itile = 0; itile < num_tiles * num_tiles; ++itile) {
+        fmt::print("Block {}:\n", itile);
+        for (int i = 0; i < tile_size; ++i) {
+            for (int j = 0; j < tile_size; ++j) {
+                fmt::print("{} ", matrix_split[itile][i * tile_size + j]);
+            }
+            fmt::print("\n");
+        }
+        fmt::print("\n");
+    }
+}
\ No newline at end of file