Add check for managed usm allocation for AMD
AMD's atomic operation implementation requires specific hardware to work
properly with the current reduction kernel. This patch adds a check, for AMD
only, so that the correct result is produced even when that specific hardware
is not available.

Signed-off-by: nscipione <[email protected]>
s-Nick committed Mar 21, 2024
1 parent eae3b23 commit a532beb
Showing 5 changed files with 94 additions and 48 deletions.
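
For context, the check introduced by this patch keys off the USM allocation kind of the result pointer. The standalone sketch below (not part of this commit; it assumes the SYCL 2020 USM API, and the names q, shared_ptr, device_ptr are illustrative only) shows how a shared, i.e. managed, allocation is distinguished from a device allocation at runtime:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  float* shared_ptr = sycl::malloc_shared<float>(1, q);
  float* device_ptr = sycl::malloc_device<float>(1, q);

  // sycl::get_pointer_type reports the USM allocation kind of a pointer.
  // Only malloc_shared allocations compare equal to sycl::usm::alloc::shared,
  // which is the condition the backend uses to pick the managed-memory path.
  std::cout << (sycl::get_pointer_type(shared_ptr, q.get_context()) ==
                sycl::usm::alloc::shared)
            << "\n";  // prints 1
  std::cout << (sycl::get_pointer_type(device_ptr, q.get_context()) ==
                sycl::usm::alloc::shared)
            << "\n";  // prints 0

  sycl::free(shared_ptr, q);
  sycl::free(device_ptr, q);
  return 0;
}

Buffer-based calls do not pass a raw pointer for the result, which is why the backend guards the query with if constexpr (std::is_pointer_v<decltype(_rs)>) in the diff below.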
6 changes: 3 additions & 3 deletions include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
* \brief Prototype for the internal implementation of the ASUM operation. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _asum_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
11 changes: 6 additions & 5 deletions include/operations/blas1_trees.h
@@ -208,7 +208,7 @@ struct AssignReduction {
* function below.
*
*/
template <typename operator_t, typename lhs_t, typename rhs_t>
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
struct WGAtomicReduction {
using value_t = typename lhs_t::value_t;
using index_t = typename rhs_t::index_t;
@@ -304,10 +304,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
lhs_, rhs_, local_num_thread_, global_num_thread_);
}

template <typename operator_t, typename lhs_t, typename rhs_t>
inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
template <typename operator_t, bool managed_mem = false, typename lhs_t,
typename rhs_t>
inline WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>
make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>(lhs_, rhs_);
}

template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
47 changes: 38 additions & 9 deletions src/interface/blas1/backend/amd_gpu.hpp
@@ -34,16 +34,45 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _asum(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
/**
 * This check is absolutely necessary on AMD GPUs.
 * AMD atomic operations require a specific hardware combination that we can
 * neither detect nor enforce on users' systems. Since the reduction kernel
 * implementation relies on atomic operations, the operator may fail silently
 * when that combination is not present. This check selects a different atomic
 * address space, at the cost of a significant performance penalty, so that
 * the kernel also behaves correctly with managed memory (i.e. malloc_shared
 * allocations).
 **/
bool managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
if (managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend
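
As a rough distillation of the dispatch above (illustrative only, not the portBLAS API; AsumConfig and select_asum_config are hypothetical names): the launch geometry is unchanged with respect to the previous code, and the managed-memory result is only forwarded as an extra template argument.

#include <cstdint>

struct AsumConfig {
  int local_size;        // work-group size
  std::int64_t num_wg;   // number of work-groups
  bool managed_mem;      // forwarded as the usmManagedMem template argument
};

// Small inputs get one 1024-wide work-group per 1024 elements; large inputs
// use a fixed 256 x 512 launch. The boolean only selects the kernel variant.
inline AsumConfig select_asum_config(std::int64_t n, bool managed_mem) {
  if (n < (1 << 18)) {
    constexpr int local_size = 1024;
    return {local_size, (n + local_size - 1) / local_size, managed_mem};
  }
  return {512, 256, managed_mem};
}

Passing the flag as a template argument rather than a runtime value lets the kernel pick the atomic address space at compile time, so the fast global-space path pays no extra cost.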
9 changes: 5 additions & 4 deletions src/interface/blas1_interface.hpp
@@ -226,9 +226,9 @@ typename sb_handle_t::event_t _asum(
 * implementation uses a kernel that doesn't
* require local memory.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _asum_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
@@ -238,7 +238,8 @@ typename sb_handle_t::event_t _asum_impl(
auto rs = make_vector_view(_rs, static_cast<increment_t>(1),
static_cast<index_t>(1));
typename sb_handle_t::event_t ret;
auto asumOp = make_wg_atomic_reduction<AbsoluteAddOperator>(rs, vx);
auto asumOp =
make_wg_atomic_reduction<AbsoluteAddOperator, usmManagedMem>(rs, vx);
if constexpr (localMemSize != 0) {
ret = sb_handle.execute(asumOp, static_cast<index_t>(localSize),
static_cast<index_t>(number_WG * localSize),
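
The localMemSize documentation above refers to the compile-time dispatch at the end of this hunk. The following simplified, hypothetical launcher (launch_fill is not the sb_handle API; it uses SYCL 2020 spellings rather than the cl::sycl namespace used in the repository) shows the same pattern of selecting between a local-memory and a local-memory-free launch:

#include <sycl/sycl.hpp>

// When LocalMemSize is zero, no local accessor is created and a
// local-memory-free kernel is launched instead.
template <int LocalMemSize>
void launch_fill(sycl::queue& q, float* out, std::size_t n) {
  q.submit([&](sycl::handler& cgh) {
    sycl::nd_range<1> range{sycl::range<1>(n), sycl::range<1>(64)};
    if constexpr (LocalMemSize != 0) {
      sycl::local_accessor<float, 1> scratch(sycl::range<1>(LocalMemSize), cgh);
      cgh.parallel_for(range, [=](sycl::nd_item<1> it) {
        scratch[it.get_local_id(0) % LocalMemSize] = 1.0f;  // touch local memory
        out[it.get_global_id(0)] = 1.0f;
      });
    } else {
      cgh.parallel_for(range, [=](sycl::nd_item<1> it) {
        out[it.get_global_id(0)] = 1.0f;
      });
    }
  }).wait();
}

int main() {
  sycl::queue q;
  float* out = sycl::malloc_device<float>(256, q);
  launch_fill<32>(q, out, 256);  // variant with work-group local memory
  launch_fill<0>(q, out, 256);   // local-memory-free variant
  sycl::free(out, q);
}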
69 changes: 42 additions & 27 deletions src/operations/blas1/WGAtomicReduction.hpp
@@ -35,26 +35,30 @@ namespace blas {
 * and atomic operations to combine the results.
*
* */
template <typename operator_t, typename lhs_t, typename rhs_t>
WGAtomicReduction<operator_t, lhs_t, rhs_t>::WGAtomicReduction(lhs_t& _l,
rhs_t& _r)
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::WGAtomicReduction(
lhs_t& _l, rhs_t& _r)
: lhs_(_l), rhs_(_r){};

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE typename WGAtomicReduction<operator_t, lhs_t, rhs_t>::index_t
WGAtomicReduction<operator_t, lhs_t, rhs_t>::get_size() const {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE
typename WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::index_t
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::get_size() const {
return rhs_.get_size();
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE bool WGAtomicReduction<operator_t, lhs_t, rhs_t>::valid_thread(
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE bool
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::valid_thread(
cl::sycl::nd_item<1> ndItem) const {
return true;
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE typename WGAtomicReduction<operator_t, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(cl::sycl::nd_item<1> ndItem) {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE
typename WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::eval(
cl::sycl::nd_item<1> ndItem) {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
Expand All @@ -80,16 +84,13 @@ WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(cl::sycl::nd_item<1> ndItem) {
}
return {};
}
template <typename operator_t, typename lhs_t, typename rhs_t>

template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
template <typename sharedT>
PORTBLAS_INLINE typename WGAtomicReduction<operator_t, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(sharedT scratch,
cl::sycl::nd_item<1> ndItem) {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
cl::sycl::access::address_space::global_space>(
lhs_.get_data()[0]);
PORTBLAS_INLINE
typename WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::eval(
sharedT scratch, cl::sycl::nd_item<1> ndItem) {
const auto size = get_size();
const int lid = static_cast<int>(ndItem.get_global_linear_id());
const auto loop_stride =
@@ -119,22 +120,36 @@ WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(sharedT scratch,
cl::sycl::plus<value_t>());
}
if (ndItem.get_local_id()[0] == 0) {
atomic_res += val;
if constexpr (!managed_mem) {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
cl::sycl::access::address_space::global_space>(
lhs_.get_data()[0]);
atomic_res += val;
} else {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
cl::sycl::access::address_space::generic_space>(
lhs_.get_data()[0]);
atomic_res += val;
}
}

return {};
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t>::bind(
cl::sycl::handler& h) {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void WGAtomicReduction<operator_t, managed_mem, lhs_t,
rhs_t>::bind(cl::sycl::handler& h) {
lhs_.bind(h);
rhs_.bind(h);
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void
WGAtomicReduction<operator_t, lhs_t, rhs_t>::adjust_access_displacement() {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void WGAtomicReduction<operator_t, managed_mem, lhs_t,
rhs_t>::adjust_access_displacement() {
lhs_.adjust_access_displacement();
rhs_.adjust_access_displacement();
}
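
To isolate what the managed-memory variant of eval changes: only the address space of the final atomic accumulation differs. A standalone sketch of the two atomic_ref flavours follows (illustrative, not part of this commit; atomic_accumulate is a hypothetical name and SYCL 2020 spellings are used rather than the repository's cl::sycl namespace):

#include <sycl/sycl.hpp>
#include <cstddef>
#include <iostream>

template <bool ManagedMem>
void atomic_accumulate(sycl::queue& q, float* result, const float* in,
                       std::size_t n) {
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
    if constexpr (ManagedMem) {
      // Generic address space: safe for malloc_shared (managed) results.
      sycl::atomic_ref<float, sycl::memory_order::relaxed,
                       sycl::memory_scope::device,
                       sycl::access::address_space::generic_space>
          ref(*result);
      ref += in[i];
    } else {
      // Global address space: faster when the result is in device memory.
      sycl::atomic_ref<float, sycl::memory_order::relaxed,
                       sycl::memory_scope::device,
                       sycl::access::address_space::global_space>
          ref(*result);
      ref += in[i];
    }
  }).wait();
}

int main() {
  sycl::queue q;
  const std::size_t n = 1024;
  float* in = sycl::malloc_shared<float>(n, q);
  float* result = sycl::malloc_shared<float>(1, q);
  for (std::size_t i = 0; i < n; ++i) in[i] = 1.0f;
  *result = 0.0f;
  atomic_accumulate<true>(q, result, in, n);  // managed-memory (generic) path
  std::cout << *result << "\n";               // expected: 1024
  sycl::free(in, q);
  sycl::free(result, q);
}

On the affected AMD configurations only the generic_space form is reliable for malloc_shared results; global_space remains the faster choice when the result lives in ordinary device memory, which is why the kernel keeps both paths.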
