
Commit fa4aedf
Merge pull request #7 from therault/potrf-cuda-wip
Add accelerator support for POTRF
devreal authored Oct 11, 2023
2 parents 366e60e + 94f3a81 commit fa4aedf
Showing 24 changed files with 811 additions and 342 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -54,6 +54,7 @@ option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization
option(TTG_ENABLE_CUDA "Whether TTG will look for CUDA" OFF)
option(TTG_ENABLE_HIP "Whether TTG will look for HIP" OFF)
option(TTG_EXAMPLES "Whether to build examples" OFF)
option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)

option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
@@ -68,6 +69,11 @@ if (FORCE_COLORED_OUTPUT)
endif ()
endif (FORCE_COLORED_OUTPUT)

if (TTG_ENABLE_ASAN)
add_compile_options(-fsanitize=address)
add_link_options(-fsanitize=address)
endif (TTG_ENABLE_ASAN)

set(TTG_HIP_PLATFORM "__HIP_PLATFORM_AMD__" CACHE STRING "Which platform to use when compiling HIP-related code (default: __HIP_PLATFORM_AMD__)")
##########################
#### prerequisites
@@ -120,6 +126,11 @@ if (TTG_ENABLE_HIP)
if (TARGET roc::hipblas)
set(TTG_HAVE_HIPBLAS True CACHE BOOL "TTG detected support for hipBLAS")
endif()

find_package(hipsolver)
if (TARGET roc::hipsolver)
set(TTG_HAVE_HIPSOLVER True CACHE BOOL "TTG detected support for hipSolver")
endif()
add_compile_definitions(${TTG_HIP_PLATFORM})
endif(TTG_ENABLE_HIP)
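
The detection results surface as cache variables (TTG_HAVE_HIPBLAS, TTG_HAVE_HIPSOLVER). As a sketch, assuming they are propagated into the generated ttg/config.h the same way TTG_HAVE_CUDART and TTG_HAVE_HIPBLAS are used by devblas_helper.h below, dependent C++ code can guard on them:

#include "ttg/config.h"

// Reached only if CMake detected roc::hipsolver (assumes the cache variable
// is exported into ttg/config.h, as devblas_helper.h relies on for hipBLAS).
#if defined(TTG_HAVE_HIPSOLVER)
#include <hipsolver/hipsolver.h>
#endif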

12 changes: 12 additions & 0 deletions examples/CMakeLists.txt
@@ -16,11 +16,23 @@ if (TARGET tiledarray)
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")

if (TARGET CUDA::cusolver)
add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1
RUNTIMES "parsec")
endif(TARGET CUDA::cusolver)
elseif (TARGET roc::hipblas)
add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")
if (TARGET roc::hipsolver)
add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver
RUNTIMES "parsec")
endif(TARGET roc::hipsolver)
endif()

add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
128 changes: 128 additions & 0 deletions examples/devblas_helper.h
@@ -0,0 +1,128 @@
#include "ttg/config.h"

#include <memory>
#include <stdexcept>
#include <optional>
#include <map>
#include <iostream>  // std::cout / std::cerr are used below

#ifdef TTG_HAVE_CUDART

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
/// \return the cuBLAS handle for the current thread
template<typename T = int>
inline const cublasHandle_t& cublas_handle(T _ = 0) {
using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
static thread_local map_type handles;

int device = ttg::device::current_device();
cudaStream_t stream = ttg::device::current_stream();

map_type::iterator it;
if ((it = handles.find({device, stream})) == handles.end()){
cublasHandle_t handle;
auto status = cublasCreate_v2(&handle);
if (CUBLAS_STATUS_SUCCESS != status) {
std::cerr << "cublasCreate_v2 failed: " << status << std::endl;
throw std::runtime_error("cublasCreate_v2 failed");
}
status = cublasSetStream_v2(handle, ttg::device::current_stream());
if (CUBLAS_STATUS_SUCCESS != status) {
std::cerr << "cublasSetStream_v2 failed: " << status << std::endl;
throw std::runtime_error("cublasSetStream_v2 failed");
}
auto [iterator, success] = handles.insert({{device, stream}, handle});
it = iterator;
}
return it->second;
}

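/// \brief Returns the cuSOLVER handle to be used for launching cuSOLVER kernels from the current thread
/// \return the cuSOLVER handle for the current thread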
template<typename T = int>
inline const cusolverDnHandle_t& cusolver_handle(T _ = 0) {

using map_type = std::map<std::pair<int, cudaStream_t>, cusolverDnHandle_t>;
static thread_local map_type handles;

int device = ttg::device::current_device();
cudaStream_t stream = ttg::device::current_stream();

map_type::iterator it;
if ((it = handles.find({device, stream})) == handles.end()){
cusolverDnHandle_t handle;
auto status = cusolverDnCreate(&handle);
if (CUSOLVER_STATUS_SUCCESS != status) {
std::cerr << "cusolverDnCreate failed: " << status << std::endl;
throw std::runtime_error("cusolverDnCreate failed");
}
status = cusolverDnSetStream(handle, stream);
if (CUSOLVER_STATUS_SUCCESS != status) {
std::cerr << "cusolverDnSetStream failed: " << status << std::endl;
throw std::runtime_error("cusolverDnSetStream failed");
}

std::cout << "Creating cusolver handle " << handle << " for device " << device << " stream " << stream << std::endl;
auto [iterator, success] = handles.insert({{device, stream}, handle});
it = iterator;
} else {
std::cout << "Found cusolver handle " << it->second << " for device " << device << " stream " << stream << std::endl;
}

return it->second;
}
#endif // TTG_HAVE_CUDART

#ifdef TTG_HAVE_HIPBLAS

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hipsolver/hipsolver.h>

/// \brief Returns the hipBLAS handle to be used for launching hipBLAS kernels from the current thread
/// \return the hipBLAS handle for the current thread
template<typename T = int>
const hipblasHandle_t& hipblas_handle(T _ = 0) {
static thread_local std::map<int, hipblasHandle_t> handles;
int device = ttg::device::current_device();
std::map<int, hipblasHandle_t>::iterator it;
if ((it = handles.find(device)) == handles.end()){
hipblasHandle_t handle;
auto status = hipblasCreate(&handle);
if (HIPBLAS_STATUS_SUCCESS != status) {
throw std::runtime_error("hipblasCreate failed");
}
auto [iterator, success] = handles.insert({device, handle});
it = iterator;
}
hipblasStatus_t status = hipblasSetStream(it->second, ttg::device::current_stream());
if (HIPBLAS_STATUS_SUCCESS != status) {
throw std::runtime_error("hipblasSetStream failed");
}
return it->second;
}

/// \brief Returns the hipSOLVER handle to be used for launching hipSOLVER kernels from the current thread
/// \return the hipSOLVER handle for the current thread
template<typename T = int>
const hipsolverDnHandle_t& hipsolver_handle(T _ = 0) {
static thread_local std::map<int, hipsolverDnHandle_t> handles;
int device = ttg::device::current_device();
std::map<int, hipsolverDnHandle_t>::iterator it;
if ((it = handles.find(device)) == handles.end()){
hipsolverDnHandle_t handle;
auto status = hipsolverDnCreate(&handle);
if (HIPSOLVER_STATUS_SUCCESS != status) {
throw std::runtime_error("hipsolverCreate failed");
}
auto [iterator, success] = handles.insert({device, handle});
it = iterator;
}
hipsolverStatus_t status = hipsolverDnSetStream(it->second, ttg::device::current_stream());
if (HIPSOLVER_STATUS_SUCCESS != status) {
throw std::runtime_error("hipsolverSetStream failed");
}
return it->second;
}
#endif // TTG_HAVE_HIPBLAS
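
For illustration, a minimal usage sketch for these helpers follows (the function factor_tile and its workspace arguments are hypothetical; only cublas_handle()/cusolver_handle() come from this header). Each handle is cached per (device, stream) pair and bound to ttg::device::current_stream() at creation, so the cuSOLVER call runs on whatever stream the runtime selected for the calling task:

#include <stdexcept>
#include "devblas_helper.h"  // provides cublas_handle() / cusolver_handle()

// Hypothetical device-task body: Cholesky-factor an n x n tile that already
// resides in device memory, using the cached handle for the current
// device/stream pair.
void factor_tile(double* A_dev, int n, int lda,
                 double* workspace, int lwork, int* dev_info) {
  const auto& handle = cusolver_handle();  // stream was set at creation
  auto status = cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n,
                                 A_dev, lda, workspace, lwork, dev_info);
  if (CUSOLVER_STATUS_SUCCESS != status)
    throw std::runtime_error("cusolverDnDpotrf failed");
}

Note the design difference between the two backends: the CUDA helpers key their cache on the (device, stream) pair and set the stream once at handle creation, while the HIP helpers key on the device alone and re-bind the stream on every lookup.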
98 changes: 45 additions & 53 deletions examples/matrixtile.h
@@ -6,111 +6,94 @@

#include <ttg/serialization/splitmd_data_descriptor.h>

template <typename T>
class MatrixTile {
#include <TiledArray/device/allocators.h>

template <typename T, class Allocator = TiledArray::device_pinned_allocator<double>>
class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
public:
using metadata_t = typename std::tuple<int, int, int>;

using pointer_t = typename std::shared_ptr<T>;
using buffer_t = typename ttg::buffer<T, Allocator>;
using ttvalue_type = ttg::TTValue<MatrixTile<T, Allocator>>;

private:
pointer_t _data;
buffer_t _buffer;
int _rows = 0, _cols = 0, _lda = 0;

// (Re)allocate the tile memory
void realloc() {
// std::cout << "Reallocating new tile" << std::endl;
_data = std::shared_ptr<T>(new T[_lda * _cols], [](T* p) { delete[] p; });
_buffer.reset(_lda * _cols);
}

public:
MatrixTile() {}

MatrixTile(int rows, int cols, int lda) : _rows(rows), _cols(cols), _lda(lda) { realloc(); }
MatrixTile(int rows, int cols, int lda)
: ttvalue_type()
, _buffer(lda*cols)
, _rows(rows)
, _cols(cols)
, _lda(lda)
{ }

MatrixTile(const metadata_t& metadata)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {}

MatrixTile(int rows, int cols, pointer_t data, int lda) : _data(data), _rows(rows), _cols(cols), _lda(lda) {}

MatrixTile(const metadata_t& metadata, pointer_t data)
MatrixTile(const metadata_t& metadata, T* data)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {}

/**
* Constructor with outside memory. The tile will *not* delete this memory
* upon destruction.
*/
MatrixTile(int rows, int cols, T* data, int lda) : _data(data, [](T*) {}), _rows(rows), _cols(cols), _lda(lda) {}

MatrixTile(const metadata_t& metadata, T* data)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {}

#if 0
/* Copy dtor and operator with a static_assert to catch unexpected copying */
MatrixTile(const MatrixTile& other) {
static_assert("Oops, copy ctor called?!");
}

MatrixTile& operator=(const MatrixTile& other) {
static_assert("Oops, copy ctor called?!");
}
#endif

MatrixTile(MatrixTile<T>&& other) = default;
MatrixTile(int rows, int cols, T* data, int lda)
: ttvalue_type()
, _buffer(data, lda*cols)
, _rows(rows)
, _cols(cols)
, _lda(lda)
{ }

MatrixTile& operator=(MatrixTile<T>&& other) = default;
MatrixTile(MatrixTile<T, Allocator>&& other) = default;

#if 0
/* Defaulted copy ctor and op for shallow copies, see comment below */
MatrixTile(const MatrixTile<T>& other) = default;
MatrixTile& operator=(MatrixTile<T, Allocator>&& other) = default;

MatrixTile& operator=(const MatrixTile<T>& other) = default;
#endif // 0
/* Deep copy ctor and op are not needed for POTRF since tiles will never be read
* and written concurrently. Hence shallow copies are enough, with all
* receiving tasks sharing tile data. Re-enable this once the PaRSEC backend
* can handle data sharing without excessive copying */
#if 1
MatrixTile(const MatrixTile<T>& other) : _rows(other._rows), _cols(other._cols), _lda(other._lda) {
this->realloc();
MatrixTile(const MatrixTile<T, Allocator>& other)
: ttvalue_type()
, _buffer(other._lda*other._cols)
, _rows(other._rows)
, _cols(other._cols)
, _lda(other._lda) {
std::copy_n(other.data(), _lda * _cols, this->data());
}

MatrixTile& operator=(const MatrixTile<T>& other) {
MatrixTile& operator=(const MatrixTile<T, Allocator>& other) {
this->_rows = other._rows;
this->_cols = other._cols;
this->_lda = other._lda;
this->realloc();
std::copy_n(other.data(), _lda * _cols, this->data());
return *this;
}
#endif // 1

void set_metadata(metadata_t meta) {
_rows = std::get<0>(meta);
_cols = std::get<1>(meta);
_lda = std::get<2>(meta);
this->realloc();
}

metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; }

// Accessing the raw data
T* data() { return _data.get(); }

const T* data() const { return _data.get(); }

/// @return shared_ptr to data
pointer_t data_shared() & { return _data; }

/// @return shared_ptr to data
pointer_t data_shared() const& { return _data; }
T* data() { return _buffer.host_ptr(); }

/// yields data and resets this object to a default-constructed state
pointer_t yield_data() && {
pointer_t result = _data;
*this = MatrixTile();
return std::move(result);
}
const T* data() const { return _buffer.host_ptr(); }

size_t size() const { return _cols * _lda; }

@@ -120,8 +103,17 @@ class MatrixTile {

int lda() const { return _lda; }

buffer_t& buffer() {
return _buffer;
}

const buffer_t& buffer() const {
return _buffer;
}

auto& fill(T value) {
std::fill(_data.get(), _data.get() + size(), value);
std::fill(data(), data() + size(), value);
_buffer.set_current_device(0);
return *this;
}

(Diff for the remaining 20 changed files not shown.)
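
To make the buffer-backed tile concrete, here is a minimal sketch of building a MatrixTile on the host (make_identity_tile is hypothetical; the constructor and accessors are those shown in the matrixtile.h diff above):

#include "matrixtile.h"

// Hypothetical helper: create an n x n identity tile. The storage now lives
// in a ttg::buffer, so the runtime can mirror it onto a device later;
// host-side access goes through data(), i.e. the buffer's host pointer.
MatrixTile<double> make_identity_tile(int n) {
  MatrixTile<double> tile(n, n, /*lda=*/n);
  tile.fill(0.0);  // fills on the host, then marks device 0 as current
  for (int i = 0; i < n; ++i)
    tile.data()[i + i * tile.lda()] = 1.0;  // diagonal, column-major layout
  return tile;  // a ttg::TTValue: moved, not deep-copied, through the graph
}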
