Skip to content

Commit

Permalink
Add Kokkos PLGSY implementation to initialize tiles for POTRF and co
Browse files Browse the repository at this point in the history
Signed-off-by: Joseph Schuchart <[email protected]>
  • Loading branch information
devreal committed Dec 18, 2023
1 parent b9727a4 commit 4901b30
Show file tree
Hide file tree
Showing 12 changed files with 387 additions and 67 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ else(MPI_FOUND)
set(TTG_HAVE_MPIEXT $<BOOL:false>)
endif(MPI_FOUND)

find_package(Kokkos COMPONENTS separable_compilation)

##########################
#### Examples
##########################
Expand Down
19 changes: 17 additions & 2 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@ if (TARGET tiledarray)
add_ttg_executable(testing_dlauum potrf/testing_dlauum.cc LINK_LIBRARIES tiledarray lapackpp)
add_ttg_executable(testing_dpoinv potrf/testing_dpoinv.cc LINK_LIBRARIES tiledarray lapackpp)

if (TARGET Kokkos::kokkos)
add_library(plgsy-device STATIC potrf/core_plgsy.cu)
target_link_libraries(plgsy-device PRIVATE Kokkos::kokkos)
target_compile_options(plgsy-device PRIVATE --extended-lambda --expt-relaxed-constexpr -I${CMAKE_CURRENT_SOURCE_DIR}/../ttg/ -I${CMAKE_CURRENT_SOURCE_DIR})
target_compile_definitions(plgsy-device PRIVATE -DTTG_HAVE_KOKKOS)
# TODO: target_include_directories does not seem to work with CUDA :/
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ttg/ ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/potrf)
#add_library(plgsy-host STATIC potrf/core_plgsy.cc)
#target_link_libraries(plgsy-host PRIVATE Kokkos::kokkos)
endif(TARGET Kokkos::kokkos)

if (TARGET CUDA::cublas)
add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas
Expand All @@ -26,7 +37,9 @@ if (TARGET tiledarray)
if (TARGET CUDA::cusolver)
add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver
COMPILE_DEFINITIONS TTG_ENABLE_CUDA=1 #;DEBUG_TILES_VALUES=1
$<$<TARGET_EXISTS:Kokkos::kokkos>:$<TARGET_OBJECTS:Kokkos::kokkoscore>>
$<$<TARGET_EXISTS:Kokkos::kokkos>:plgsy-device>
COMPILE_DEFINITIONS TTG_ENABLE_CUDA=1;$<$<TARGET_EXISTS:Kokkos::kokkos>:TTG_HAVE_KOKKOS=1> #;DEBUG_TILES_VALUES=1
RUNTIMES "parsec")
endif(TARGET CUDA::cusolver)
elseif (TARGET roc::hipblas)
Expand All @@ -37,7 +50,9 @@ if (TARGET tiledarray)
if (TARGET roc::hipsolver)
add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver
COMPILE_DEFINITIONS TTG_ENABLE_HIP=1;DEBUG_TILES_VALUES=1
$<$<TARGET_EXISTS:Kokkos::kokkos>:$<TARGET_OBJECTS:Kokkos::kokkoscore>>
$<$<TARGET_EXISTS:Kokkos::kokkos>:plgsy-device>
COMPILE_DEFINITIONS TTG_ENABLE_HIP=1;$<$<TARGET_EXISTS:Kokkos::kokkos>:TTG_HAVE_KOKKOS=1> #;DEBUG_TILES_VALUES=1
RUNTIMES "parsec")
endif(TARGET roc::hipsolver)
elseif (TARGET MKL::MKL_DPCPP)
Expand Down
167 changes: 167 additions & 0 deletions examples/potrf/core_plgsy.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@

#include <Kokkos_Core.hpp>

//#include "ttg/device/device.h"

#include <iostream>


// fwd-decl
namespace ttg::device {
cudaStream_t current_stream();
} // namespace ttg::device

//#define KOKKOS_PLGSY_HOST

#ifdef KOKKOS_PLGSY_HOST
#define KOKKOS_SPACE Kokkos::HostSpace
#define KOKKOS_POLICY Kokkos::Serial
#define KOKKOS_POLICY_INSTANCE Kokkos::Serial()
#else
#define KOKKOS_SPACE Kokkos::Cuda::memory_space
#define KOKKOS_POLICY Kokkos::Cuda
#define KOKKOS_POLICY_INSTANCE Kokkos::Cuda(ttg::device::current_stream())
#endif

#include "random.h"

namespace detail {

template<typename T>
struct num_elems : std::integral_constant<int, 1>
{ };

template<typename T>
struct num_elems<std::complex<T>> : std::integral_constant<int, 2>
{ };

template<typename T>
static constexpr int num_elems_v = num_elems<T>::value;

template<typename T>
struct is_complex : std::integral_constant<bool, false>
{ };

template<typename T>
struct is_complex<std::complex<T>> : std::integral_constant<bool, true>
{ };

template<typename T>
static constexpr bool is_complex_v = is_complex<T>::value;
} // namespace detail

template <typename T>
void CORE_plgsy_device( T bump, int m, int n, T *A, int lda,
int gM, int m0, int n0, unsigned long long int seed ) {
static constexpr int nbelem = detail::num_elems_v<T>;
auto layout = Kokkos::LayoutStride(m, lda, n, 1);
auto view = Kokkos::View<T**, Kokkos::LayoutStride,
KOKKOS_SPACE>(A, layout);

//std::cout << "CORE_plgsy_device stream " << ttg::device::current_stream()
// << " buffer " << A
// << " m " << m << " n " << n
// << " lda " << lda << " gM " << gM << " m0 " << m0
// << " n0 " << n0 << " seed " << seed << std::endl;

auto rnd = KOKKOS_LAMBDA(unsigned long long int n, unsigned long long int seed) {
unsigned long long int a_k, c_k, ran;
a_k = Rnd64_A;
c_k = Rnd64_C;
ran = seed;
for (int i = 0; n; n >>= 1, ++i) {
if (n & 1)
ran = a_k * ran + c_k;
c_k *= (a_k + 1);
a_k *= a_k;
}

return ran;
};

auto gen = KOKKOS_LAMBDA(unsigned long long ran){
if constexpr(detail::is_complex_v<T>) {
return T((0.5f - ran * RndF_Mul), (0.5f - ran * RndF_Mul));
} else {
return (0.5f - ran * RndF_Mul);
}
};


KOKKOS_POLICY pol = KOKKOS_POLICY_INSTANCE;

if ( m0 == n0 ) {
/* diagonal */
Kokkos::parallel_for("diagonal",
Kokkos::MDRangePolicy<KOKKOS_POLICY, Kokkos::Rank<2>>(pol, {0, 0}, {n, m}),
KOKKOS_LAMBDA(int row, int col) {
unsigned long long int jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)gM;
jump += std::min(col, row)*(gM+1);
unsigned long long int ran;
ran = rnd( nbelem * jump, seed );
for (int i = 0; i < row; ++i) {
for (int j = i; j < col; ++j) {
ran = Rnd64_A*ran + Rnd64_C;
}
}
view(row, col) = gen(ran);
if (row == col) {
/* bump diagonal */
view(row, col) += bump;
}
}
);
} else if (m0 > n0) {
/* Lower part */
Kokkos::parallel_for("lower part",
Kokkos::MDRangePolicy<KOKKOS_POLICY, Kokkos::Rank<2>>(pol, {0, 0}, {n, m}),
KOKKOS_LAMBDA(int row, int col) {
unsigned long long int jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)gM;
jump += row*gM;
unsigned long long int ran;
ran = rnd( nbelem * jump, seed );
for (int i = 0; i < row*n+col; ++i) {
ran = Rnd64_A*ran + Rnd64_C;
}
view(row, col) = gen(ran);
}
);
} else {
/* upper part */
Kokkos::parallel_for("upper part",
Kokkos::MDRangePolicy<KOKKOS_POLICY, Kokkos::Rank<2>>(pol, {0, 0}, {n, m}),
KOKKOS_LAMBDA(int row, int col) {
unsigned long long int jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)gM;
jump += col*gM;
unsigned long long int ran;
ran = rnd( nbelem * jump, seed );
for (int i = 0; i < col*n+row; ++i) {
ran = Rnd64_A*ran + Rnd64_C;
}
view(row, col) = gen(ran);
}
);
}
}

/* implicit instantiations */
template void CORE_plgsy_device<float>(float bump, int m, int n, float *A, int lda,
int gM, int m0, int n0, unsigned long long int seed);

template void CORE_plgsy_device<double>(double bump, int m, int n, double *A, int lda,
int gM, int m0, int n0, unsigned long long int seed);

template void CORE_plgsy_device<std::complex<float>>(std::complex<float> bump, int m, int n, std::complex<float> *A, int lda,
int gM, int m0, int n0, unsigned long long int seed);

template void CORE_plgsy_device<std::complex<double>>(std::complex<double> bump, int m, int n, std::complex<double> *A, int lda,
int gM, int m0, int n0, unsigned long long int seed);


void kokkos_init(int& argc, char* argv[]) {
Kokkos::initialize(argc, argv);
}

void kokkos_finalize() {
Kokkos::finalize();
}
57 changes: 26 additions & 31 deletions examples/potrf/core_plgsy.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,23 +75,27 @@ void CORE_plgsy( T bump, int m, int n, T *A, int lda,

jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)gM;

/*
* Tile diagonal
*/
if ( m0 == n0 ) {
for (j = 0; j < n; j++) {
ran = Rnd64_jump( nbelem * jump, seed );

for (i = j; i < m; i++) {
auto fill_line = [](T* line, int from, int to, int ran, int ptr_advance){
T* tmp = line;
for (int i = from; i < to; i++) {
*tmp = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
if constexpr (std::is_same<T, std::complex<double>>::value ||
std::is_same<T, std::complex<float>>::value ) {
*tmp += T(- ran * RndF_Mul, 0.5);
ran = Rnd64_A * ran + Rnd64_C;
*tmp += T(- ran * RndF_Mul, 0.5);
ran = Rnd64_A * ran + Rnd64_C;
}
tmp++;
tmp += ptr_advance;
}
};
/*
* Tile diagonal
*/
if ( m0 == n0 ) {
for (j = 0; j < n; j++) {
ran = Rnd64_jump( nbelem * jump, seed );
fill_line(tmp, j, m, ran, 1);
tmp += (lda - i + j + 1);
jump += gM + 1;
}
Expand All @@ -110,17 +114,7 @@ void CORE_plgsy( T bump, int m, int n, T *A, int lda,
else if ( m0 > n0 ) {
for (j = 0; j < n; j++) {
ran = Rnd64_jump( nbelem * jump, seed );

for (i = 0; i < m; i++) {
*tmp = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
if constexpr (std::is_same<T, std::complex<double>>::value ||
std::is_same<T, std::complex<float>>::value ) {
*tmp += T(- ran * RndF_Mul, 0.5);
ran = Rnd64_A * ran + Rnd64_C;
}
tmp++;
}
fill_line(tmp, 0, j, ran, 1);
tmp += (lda - i);
jump += gM;
}
Expand All @@ -134,17 +128,18 @@ void CORE_plgsy( T bump, int m, int n, T *A, int lda,

for (i = 0; i < m; i++) {
ran = Rnd64_jump( nbelem * jump, seed );

for (j = 0; j < n; j++) {
A[j*lda+i] = 0.5f - ran * RndF_Mul;
ran = Rnd64_A * ran + Rnd64_C;
if constexpr (std::is_same<T, std::complex<double>>::value ||
std::is_same<T, std::complex<float>>::value ) {
A[j*lda+i] += T(0.0, 0.5f - ran * RndF_Mul);
ran = Rnd64_A * ran + Rnd64_C;
}
}
fill_line(&A[i], 0, n, ran, lda);
jump += gM;
}
}
}


template <typename T>
void CORE_plgsy_device( T bump, int m, int n, T *A, int lda,
int gM, int m0, int n0, unsigned long long int seed );


template <typename T>
void CORE_plgsy_kokkos_host( T bump, int m, int n, T *A, int lda,
int gM, int m0, int n0, unsigned long long int seed );
22 changes: 22 additions & 0 deletions examples/potrf/norm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#pragma once

#include "../devblas_helper.h"
#include "../matrixtile.h"


#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP)

static void device_norm(const MatrixTile<double> &A, double *norm) {
auto size = A.size();
auto buffer = A.buffer().current_device_ptr();
std::cout << "device_norm ptr " << buffer << " device " << ttg::device::current_device()
<< " stream " << ttg::device::current_stream() << std::endl;
#if defined(TTG_HAVE_CUDA)
auto handle = cublas_handle();
//double n = 1.0;
cublasDnrm2(handle, size, buffer, 1, norm);
#elif defined(TTG_HAVE_HIPBLAS)
hipblasDnrm2(hipblas_handle(), size, buffer, 1, norm);
#endif
}
#endif // defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP)
Loading

0 comments on commit 4901b30

Please sign in to comment.