
Commit fa4aedf
Merge pull request #7 from therault/potrf-cuda-wip
Add accelerator support for POTRF
devreal authored Oct 11, 2023
2 parents 366e60e + 94f3a81 commit fa4aedf
Showing 24 changed files with 811 additions and 342 deletions.
11 changes: 11 additions & 0 deletions CMakeLists.txt
@@ -54,6 +54,7 @@ option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization
option(TTG_ENABLE_CUDA "Whether TTG will look for CUDA" OFF)
option(TTG_ENABLE_HIP "Whether TTG will look for HIP" OFF)
option(TTG_EXAMPLES "Whether to build examples" OFF)
option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF)

option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF)
option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF)
@@ -68,6 +69,11 @@ if (FORCE_COLORED_OUTPUT)
endif ()
endif (FORCE_COLORED_OUTPUT)

if (TTG_ENABLE_ASAN)
add_compile_options(-fsanitize=address)
add_link_options(-fsanitize=address)
endif (TTG_ENABLE_ASAN)

set(TTG_HIP_PLATFORM "__HIP_PLATFORM_AMD__" CACHE STRING "Which platform to use when compiling HIP-related code (default: __HIP_PLATFORM_AMD__)")
##########################
#### prerequisites
@@ -120,6 +126,11 @@ if (TTG_ENABLE_HIP)
if (TARGET roc::hipblas)
set(TTG_HAVE_HIPBLAS True CACHE BOOL "TTG detected support for hipBLAS")
endif()

find_package(hipsolver)
if (TARGET roc::hipsolver)
set(TTG_HAVE_HIPSOLVER True CACHE BOOL "TTG detected support for hipSolver")
endif()
add_compile_definitions(${TTG_HIP_PLATFORM})
endif(TTG_ENABLE_HIP)
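
The detection results surface as cache variables (TTG_HAVE_HIPBLAS, TTG_HAVE_HIPSOLVER). As a sketch, assuming they are propagated into the generated ttg/config.h the same way TTG_HAVE_CUDART and TTG_HAVE_HIPBLAS are used by devblas_helper.h below, dependent C++ code can guard on them:

#include "ttg/config.h"

// Reached only if CMake detected roc::hipsolver (assumes the cache variable
// is exported into ttg/config.h, as devblas_helper.h relies on for hipBLAS).
#if defined(TTG_HAVE_HIPSOLVER)
#include <hipsolver/hipsolver.h>
#endif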

12 changes: 12 additions & 0 deletions examples/CMakeLists.txt
@@ -16,11 +16,23 @@ if (TARGET tiledarray)
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")

if (TARGET CUDA::cusolver)
add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1
RUNTIMES "parsec")
endif(TARGET CUDA::cusolver)
elseif (TARGET roc::hipblas)
add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc
LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas
COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2
RUNTIMES "parsec")
if (TARGET roc::hipsolver)
add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc
LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver
RUNTIMES "parsec")
endif(TARGET roc::hipsolver)
endif()

add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec")
128 changes: 128 additions & 0 deletions examples/devblas_helper.h
@@ -0,0 +1,128 @@
#include "ttg/config.h"

#include <memory>
#include <stdexcept>
#include <optional>
#include <map>
#include <iostream>  // std::cout / std::cerr are used below

#ifdef TTG_HAVE_CUDART

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>

/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
/// \return the cuBLAS handle for the current thread
template<typename T = int>
inline const cublasHandle_t& cublas_handle(T _ = 0) {
using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
static thread_local map_type handles;

int device = ttg::device::current_device();
cudaStream_t stream = ttg::device::current_stream();

map_type::iterator it;
if ((it = handles.find({device, stream})) == handles.end()){
cublasHandle_t handle;
auto status = cublasCreate_v2(&handle);
if (CUBLAS_STATUS_SUCCESS != status) {
std::cerr << "cublasCreate_v2 failed: " << status << std::endl;
throw std::runtime_error("cublasCreate_v2 failed");
}
status = cublasSetStream_v2(handle, ttg::device::current_stream());
if (CUBLAS_STATUS_SUCCESS != status) {
std::cerr << "cublasSetStream_v2 failed: " << status << std::endl;
throw std::runtime_error("cublasSetStream_v2 failed");
}
auto [iterator, success] = handles.insert({{device, stream}, handle});
it = iterator;
}
return it->second;
}

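/// \brief Returns the cuSOLVER handle to be used for launching cuSOLVER kernels from the current thread
/// \return the cuSOLVER handle for the current thread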
template<typename T = int>
inline const cusolverDnHandle_t& cusolver_handle(T _ = 0) {

using map_type = std::map<std::pair<int, cudaStream_t>, cusolverDnHandle_t>;
static thread_local map_type handles;

int device = ttg::device::current_device();
cudaStream_t stream = ttg::device::current_stream();

map_type::iterator it;
if ((it = handles.find({device, stream})) == handles.end()){
cusolverDnHandle_t handle;
auto status = cusolverDnCreate(&handle);
if (CUSOLVER_STATUS_SUCCESS != status) {
std::cerr << "cusolverDnCreate failed: " << status << std::endl;
throw std::runtime_error("cusolverDnCreate failed");
}
status = cusolverDnSetStream(handle, stream);
if (CUSOLVER_STATUS_SUCCESS != status) {
std::cerr << "cusolverDnSetStream failed: " << status << std::endl;
throw std::runtime_error("cusolverDnSetStream failed");
}

std::cout << "Creating cusolver handle " << handle << " for device " << device << " stream " << stream << std::endl;
auto [iterator, success] = handles.insert({{device, stream}, handle});
it = iterator;
} else {
std::cout << "Found cusolver handle " << it->second << " for device " << device << " stream " << stream << std::endl;
}

return it->second;
}
#endif // TTG_HAVE_CUDART

#ifdef TTG_HAVE_HIPBLAS

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hipsolver/hipsolver.h>

/// \brief Returns the hipBLAS handle to be used for launching hipBLAS kernels from the current thread
/// \return the hipBLAS handle for the current thread
template<typename T = int>
const hipblasHandle_t& hipblas_handle(T _ = 0) {
static thread_local std::map<int, hipblasHandle_t> handles;
int device = ttg::device::current_device();
std::map<int, hipblasHandle_t>::iterator it;
if ((it = handles.find(device)) == handles.end()){
hipblasHandle_t handle;
auto status = hipblasCreate(&handle);
if (HIPBLAS_STATUS_SUCCESS != status) {
throw std::runtime_error("hipblasCreate failed");
}
auto [iterator, success] = handles.insert({device, handle});
it = iterator;
}
hipblasStatus_t status = hipblasSetStream(it->second, ttg::device::current_stream());
if (HIPBLAS_STATUS_SUCCESS != status) {
throw std::runtime_error("hipblasSetStream failed");
}
return it->second;
}

/// \brief Returns the hipSOLVER handle to be used for launching hipSOLVER kernels from the current thread
/// \return the hipSOLVER handle for the current thread
template<typename T = int>
const hipsolverDnHandle_t& hipsolver_handle(T _ = 0) {
static thread_local std::map<int, hipsolverDnHandle_t> handles;
int device = ttg::device::current_device();
std::map<int, hipsolverDnHandle_t>::iterator it;
if ((it = handles.find(device)) == handles.end()){
hipsolverDnHandle_t handle;
auto status = hipsolverDnCreate(&handle);
if (HIPSOLVER_STATUS_SUCCESS != status) {
throw std::runtime_error("hipsolverCreate failed");
}
auto [iterator, success] = handles.insert({device, handle});
it = iterator;
}
hipsolverStatus_t status = hipsolverDnSetStream(it->second, ttg::device::current_stream());
if (HIPSOLVER_STATUS_SUCCESS != status) {
throw std::runtime_error("hipsolverSetStream failed");
}
return it->second;
}
#endif // TTG_HAVE_HIPBLAS
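
For illustration, a minimal usage sketch for these helpers follows (the function factor_tile and its workspace arguments are hypothetical; only cublas_handle()/cusolver_handle() come from this header). Each handle is cached per (device, stream) pair and bound to ttg::device::current_stream() at creation, so the cuSOLVER call runs on whatever stream the runtime selected for the calling task:

#include <stdexcept>
#include "devblas_helper.h"  // provides cublas_handle() / cusolver_handle()

// Hypothetical device-task body: Cholesky-factor an n x n tile that already
// resides in device memory, using the cached handle for the current
// device/stream pair.
void factor_tile(double* A_dev, int n, int lda,
                 double* workspace, int lwork, int* dev_info) {
  const auto& handle = cusolver_handle();  // stream was set at creation
  auto status = cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_LOWER, n,
                                 A_dev, lda, workspace, lwork, dev_info);
  if (CUSOLVER_STATUS_SUCCESS != status)
    throw std::runtime_error("cusolverDnDpotrf failed");
}

Note the design difference between the two backends: the CUDA helpers key their cache on the (device, stream) pair and set the stream once at handle creation, while the HIP helpers key on the device alone and re-bind the stream on every lookup.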
98 changes: 45 additions & 53 deletions examples/matrixtile.h
@@ -6,111 +6,94 @@

#include <ttg/serialization/splitmd_data_descriptor.h>

template <typename T>
class MatrixTile {
#include <TiledArray/device/allocators.h>

template <typename T, class Allocator = TiledArray::device_pinned_allocator<double>>
class MatrixTile : public ttg::TTValue<MatrixTile<T, Allocator>> {
public:
using metadata_t = typename std::tuple<int, int, int>;

using pointer_t = typename std::shared_ptr<T>;
using buffer_t = typename ttg::buffer<T, Allocator>;
using ttvalue_type = ttg::TTValue<MatrixTile<T, Allocator>>;

private:
pointer_t _data;
buffer_t _buffer;
int _rows = 0, _cols = 0, _lda = 0;

// (Re)allocate the tile memory
void realloc() {
// std::cout << "Reallocating new tile" << std::endl;
_data = std::shared_ptr<T>(new T[_lda * _cols], [](T* p) { delete[] p; });
_buffer.reset(_lda * _cols);
}

public:
MatrixTile() {}

MatrixTile(int rows, int cols, int lda) : _rows(rows), _cols(cols), _lda(lda) { realloc(); }
MatrixTile(int rows, int cols, int lda)
: ttvalue_type()
, _buffer(lda*cols)
, _rows(rows)
, _cols(cols)
, _lda(lda)
{ }

MatrixTile(const metadata_t& metadata)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {}

MatrixTile(int rows, int cols, pointer_t data, int lda) : _data(data), _rows(rows), _cols(cols), _lda(lda) {}

MatrixTile(const metadata_t& metadata, pointer_t data)
MatrixTile(const metadata_t& metadata, T* data)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {}

/**
* Constructor with outside memory. The tile will *not* delete this memory
* upon destruction.
*/
MatrixTile(int rows, int cols, T* data, int lda) : _data(data, [](T*) {}), _rows(rows), _cols(cols), _lda(lda) {}

MatrixTile(const metadata_t& metadata, T* data)
: MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {}

#if 0
/* Copy dtor and operator with a static_assert to catch unexpected copying */
MatrixTile(const MatrixTile& other) {
static_assert("Oops, copy ctor called?!");
}

MatrixTile& operator=(const MatrixTile& other) {
static_assert("Oops, copy ctor called?!");
}
#endif

MatrixTile(MatrixTile<T>&& other) = default;
MatrixTile(int rows, int cols, T* data, int lda)
: ttvalue_type()
, _buffer(data, lda*cols)
, _rows(rows)
, _cols(cols)
, _lda(lda)
{ }

MatrixTile& operator=(MatrixTile<T>&& other) = default;
MatrixTile(MatrixTile<T, Allocator>&& other) = default;

#if 0
/* Defaulted copy ctor and op for shallow copies, see comment below */
MatrixTile(const MatrixTile<T>& other) = default;
MatrixTile& operator=(MatrixTile<T, Allocator>&& other) = default;

MatrixTile& operator=(const MatrixTile<T>& other) = default;
#endif // 0
/* Deep copy ctor and op are not needed for POTRF since tiles will never be read
* and written concurrently. Hence shallow copies are enough, with all
* receiving tasks sharing tile data. Re-enable this once the PaRSEC backend
* can handle data sharing without excessive copying */
#if 1
MatrixTile(const MatrixTile<T>& other) : _rows(other._rows), _cols(other._cols), _lda(other._lda) {
this->realloc();
MatrixTile(const MatrixTile<T, Allocator>& other)
: ttvalue_type()
, _buffer(other._lda*other._cols)
, _rows(other._rows)
, _cols(other._cols)
, _lda(other._lda) {
std::copy_n(other.data(), _lda * _cols, this->data());
}

MatrixTile& operator=(const MatrixTile<T>& other) {
MatrixTile& operator=(const MatrixTile<T, Allocator>& other) {
this->_rows = other._rows;
this->_cols = other._cols;
this->_lda = other._lda;
this->realloc();
std::copy_n(other.data(), _lda * _cols, this->data());
return *this;
}
#endif // 1

void set_metadata(metadata_t meta) {
_rows = std::get<0>(meta);
_cols = std::get<1>(meta);
_lda = std::get<2>(meta);
this->realloc();
}

metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; }

// Accessing the raw data
T* data() { return _data.get(); }

const T* data() const { return _data.get(); }

/// @return shared_ptr to data
pointer_t data_shared() & { return _data; }

/// @return shared_ptr to data
pointer_t data_shared() const& { return _data; }
T* data() { return _buffer.host_ptr(); }

/// yields data and resets this object to a default-constructed state
pointer_t yield_data() && {
pointer_t result = _data;
*this = MatrixTile();
return std::move(result);
}
const T* data() const { return _buffer.host_ptr(); }

size_t size() const { return _cols * _lda; }

@@ -120,8 +103,17 @@ class MatrixTile {

int lda() const { return _lda; }

buffer_t& buffer() {
return _buffer;
}

const buffer_t& buffer() const {
return _buffer;
}

auto& fill(T value) {
std::fill(_data.get(), _data.get() + size(), value);
std::fill(data(), data() + size(), value);
_buffer.set_current_device(0);
return *this;
}

(Diff for the remaining 20 changed files not shown.)
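
To make the buffer-backed tile concrete, here is a minimal sketch of building a MatrixTile on the host (make_identity_tile is hypothetical; the constructor and accessors are those shown in the matrixtile.h diff above):

#include "matrixtile.h"

// Hypothetical helper: create an n x n identity tile. The storage now lives
// in a ttg::buffer, so the runtime can mirror it onto a device later;
// host-side access goes through data(), i.e. the buffer's host pointer.
MatrixTile<double> make_identity_tile(int n) {
  MatrixTile<double> tile(n, n, /*lda=*/n);
  tile.fill(0.0);  // fills on the host, then marks device 0 as current
  for (int i = 0; i < n; ++i)
    tile.data()[i + i * tile.lda()] = 1.0;  // diagonal, column-major layout
  return tile;  // a ttg::TTValue: moved, not deep-copied, through the graph
}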
