From 981b6181632af2e9899868700b66c0ea9cff7384 Mon Sep 17 00:00:00 2001
From: nscipione
Date: Thu, 21 Mar 2024 09:21:55 +0000
Subject: [PATCH] Renaming template parameter and variable, avoid memory check
 if using buffer only

Signed-off-by: nscipione
---
 include/operations/blas1_trees.h           |  9 +++--
 src/interface/blas1/backend/amd_gpu.hpp    | 30 ++++++++++-----
 src/operations/blas1/WGAtomicReduction.hpp | 44 +++++++++++++---------
 3 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
index 27a34b5ec..d005e1915 100644
--- a/include/operations/blas1_trees.h
+++ b/include/operations/blas1_trees.h
@@ -208,7 +208,8 @@ struct AssignReduction {
  * function below.
  *
  */
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 struct WGAtomicReduction {
   using value_t = typename lhs_t::value_t;
   using index_t = typename rhs_t::index_t;
@@ -304,11 +305,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
       lhs_, rhs_, local_num_thread_, global_num_thread_);
 }
 
-template <typename operator_t, bool managed_mem = false, typename lhs_t,
+template <typename operator_t, bool usmManagedMem = false, typename lhs_t,
           typename rhs_t>
-inline WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>
+inline WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>
 make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
-  return WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>(lhs_, rhs_);
+  return WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>(lhs_, rhs_);
 }
 
 template <typename operator_t, typename lhs_t, typename rhs_t>
diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
index 7ec252995..e6d8e44ed 100644
--- a/src/interface/blas1/backend/amd_gpu.hpp
+++ b/src/interface/blas1/backend/amd_gpu.hpp
@@ -43,13 +43,17 @@ typename sb_handle_t::event_t _asum(
   *address space causing a big performance degradation, but making the kernel
   *behaves correctly also with managed memory (aka malloc_shared allocation).
   **/
-  bool managed_mem{false};
+#ifdef SB_ENABLE_USM
+  bool usm_managed_mem{false};
   if constexpr (std::is_pointer_v<decltype(_rs)>) {
-    managed_mem =
+    usm_managed_mem =
         sycl::usm::alloc::shared ==
         sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
   }
-  if (managed_mem) {
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
     if (_N < (1 << 18)) {
       constexpr index_t localSize = 1024;
       const index_t number_WG = (_N + localSize - 1) / localSize;
@@ -133,13 +137,17 @@ typename sb_handle_t::event_t _nrm2(
   /**
    * Read comment in _asum above.
    **/
-  bool managed_mem{false};
+#ifdef SB_ENABLE_USM
+  bool usm_managed_mem{false};
   if constexpr (std::is_pointer_v<decltype(_rs)>) {
-    managed_mem =
+    usm_managed_mem =
         sycl::usm::alloc::shared ==
         sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
   }
-  if (managed_mem) {
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
     if (_N < (1 << 18)) {
       constexpr index_t localSize = 1024;
       const index_t number_WG = (_N + localSize - 1) / localSize;
@@ -179,13 +187,17 @@ typename sb_handle_t::event_t _dot(
   /**
    * Read comment in _asum above.
    **/
-  bool managed_mem{false};
+#ifdef SB_ENABLE_USM
+  bool usm_managed_mem{false};
   if constexpr (std::is_pointer_v<decltype(_rs)>) {
-    managed_mem =
+    usm_managed_mem =
         sycl::usm::alloc::shared ==
         sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
   }
-  if (managed_mem) {
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
     if (_N < (1 << 18)) {
       constexpr index_t localSize = 1024;
       const index_t number_WG = (_N + localSize - 1) / localSize;
diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp
index 779c33b43..21756c7c0 100644
--- a/src/operations/blas1/WGAtomicReduction.hpp
+++ b/src/operations/blas1/WGAtomicReduction.hpp
@@ -35,29 +35,34 @@ namespace blas {
  * and atomics operation to combine the results.
  *
  * */
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
-WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::WGAtomicReduction(
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
+WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::WGAtomicReduction(
     lhs_t& _l, rhs_t& _r)
     : lhs_(_l), rhs_(_r){};
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 PORTBLAS_INLINE
-    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::index_t
-    WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::get_size() const {
+    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::index_t
+    WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::get_size()
+        const {
   return rhs_.get_size();
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 PORTBLAS_INLINE bool
-WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::valid_thread(
+WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::valid_thread(
     cl::sycl::nd_item<1> ndItem) const {
   return true;
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 PORTBLAS_INLINE
-    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::value_t
-    WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::eval(
+    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::value_t
+    WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::eval(
         cl::sycl::nd_item<1> ndItem) {
   auto atomic_res =
       cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
@@ -103,10 +108,11 @@ PORTBLAS_INLINE
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 template <typename sharedT>
 PORTBLAS_INLINE
-    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::value_t
-    WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::eval(
+    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::value_t
+    WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::eval(
         sharedT scratch, cl::sycl::nd_item<1> ndItem) {
   const auto size = get_size();
   const int lid = static_cast<int>(ndItem.get_global_linear_id());
@@ -120,7 +126,7 @@ PORTBLAS_INLINE
                                       cl::sycl::plus<value_t>());
   }
   if (ndItem.get_local_id()[0] == 0) {
-    if constexpr (!managed_mem) {
+    if constexpr (!usmManagedMem) {
       auto atomic_res =
           cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
                                cl::sycl::memory_scope::device,
@@ -143,13 +149,15 @@ PORTBLAS_INLINE
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
-PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::bind(cl::sycl::handler& h) {
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
+PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::bind(cl::sycl::handler& h) {
   lhs_.bind(h);
   rhs_.bind(h);
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
-PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::adjust_access_displacement() {
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
+PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::adjust_access_displacement() {
   lhs_.adjust_access_displacement();
   rhs_.adjust_access_displacement();
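
Illustration (editor's sketch, not part of the patch): the check that this
change guards behind SB_ENABLE_USM asks the SYCL runtime whether the output
pointer comes from a malloc_shared (managed) allocation and forwards the
answer as a compile-time flag, so the kernel can choose its atomic_ref address
space. The self-contained sketch below shows that idea using only standard
SYCL 2020 calls; run_reduction and its parameters are hypothetical
placeholders, not portBLAS APIs.

    #include <sycl/sycl.hpp>

    #include <cstddef>
    #include <iostream>

    // Hypothetical dispatch helper: the bool mirrors the usmManagedMem
    // template parameter introduced by this patch.
    template <bool UsmManagedMem>
    void run_reduction(sycl::queue &q, const float *in, float *out,
                       std::size_t n) {
      (void)q; (void)in; (void)out; (void)n;  // unused in this sketch
      // A real kernel would use generic-address-space atomics when
      // UsmManagedMem is true (correct for managed memory, slower) and
      // global-space atomics otherwise (the fast path kept for buffers and
      // device USM).
      std::cout << (UsmManagedMem ? "generic-space atomics path\n"
                                  : "global-space atomics path\n");
    }

    int main() {
      sycl::queue q;
      constexpr std::size_t n = 1 << 10;
      float *in = sycl::malloc_device<float>(n, q);
      float *rs = sycl::malloc_shared<float>(1, q);  // managed allocation

      // Same runtime test as the patched usm_managed_mem variable.
      bool usm_managed_mem = sycl::usm::alloc::shared ==
                             sycl::get_pointer_type(rs, q.get_context());

      if (usm_managed_mem) {
        run_reduction<true>(q, in, rs, n);
      } else {
        run_reduction<false>(q, in, rs, n);
      }

      sycl::free(in, q);
      sycl::free(rs, q);
      return 0;
    }

The #ifdef added by the patch relies on the fact that a buffer-only build
(SB_ENABLE_USM undefined) can never receive a malloc_shared pointer, so the
query above is dead code there and usm_managed_mem can be a constexpr false,
letting the faster global-space path be selected unconditionally.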