From 981b6181632af2e9899868700b66c0ea9cff7384 Mon Sep 17 00:00:00 2001
From: nscipione
Date: Thu, 21 Mar 2024 09:21:55 +0000
Subject: [PATCH] Renaming template parameter and variable, avoid memory check
 if using buffer only

Signed-off-by: nscipione
---
 include/operations/blas1_trees.h           |  9 +++--
 src/interface/blas1/backend/amd_gpu.hpp    | 30 ++++++++++-----
 src/operations/blas1/WGAtomicReduction.hpp | 44 +++++++++++++---------
 3 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
index 27a34b5ec..d005e1915 100644
--- a/include/operations/blas1_trees.h
+++ b/include/operations/blas1_trees.h
@@ -208,7 +208,8 @@ struct AssignReduction {
  * function below.
  *
  */
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 struct WGAtomicReduction {
   using value_t = typename lhs_t::value_t;
   using index_t = typename rhs_t::index_t;
@@ -304,11 +305,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
       lhs_, rhs_, local_num_thread_, global_num_thread_);
 }
 
-template <typename operator_t, bool managed_mem = false, typename lhs_t,
+template <typename operator_t, bool usmManagedMem = false, typename lhs_t,
           typename rhs_t>
-inline WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>
+inline WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>
 make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
-  return WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>(lhs_, rhs_);
+  return WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>(lhs_, rhs_);
 }
 
 template <typename operator_t, typename lhs_t, typename rhs_t>
diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
index 7ec252995..e6d8e44ed 100644
--- a/src/interface/blas1/backend/amd_gpu.hpp
+++ b/src/interface/blas1/backend/amd_gpu.hpp
@@ -43,13 +43,17 @@ typename sb_handle_t::event_t _asum(
   *address space causing a big performance degradation, but making the kernel
   *behaves correctly also with managed memory (aka malloc_shared allocation).
   **/
-  bool managed_mem{false};
+#ifdef SB_ENABLE_USM
+  bool usm_managed_mem{false};
   if constexpr (std::is_pointer_v<decltype(_rs)>) {
-    managed_mem =
+    usm_managed_mem =
         sycl::usm::alloc::shared ==
         sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
   }
-  if (managed_mem) {
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
     if (_N < (1 << 18)) {
       constexpr index_t localSize = 1024;
       const index_t number_WG = (_N + localSize - 1) / localSize;
@@ -133,13 +137,17 @@ typename sb_handle_t::event_t _nrm2(
   /**
    * Read comment in _asum above.
    **/
-  bool managed_mem{false};
+#ifdef SB_ENABLE_USM
+  bool usm_managed_mem{false};
   if constexpr (std::is_pointer_v<decltype(_rs)>) {
-    managed_mem =
+    usm_managed_mem =
         sycl::usm::alloc::shared ==
         sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
   }
-  if (managed_mem) {
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
     if (_N < (1 << 18)) {
       constexpr index_t localSize = 1024;
       const index_t number_WG = (_N + localSize - 1) / localSize;
@@ -179,13 +187,17 @@ typename sb_handle_t::event_t _dot(
   /**
    * Read comment in _asum above.
    **/
-  bool managed_mem{false};
+#ifdef SB_ENABLE_USM
+  bool usm_managed_mem{false};
   if constexpr (std::is_pointer_v<decltype(_rs)>) {
-    managed_mem =
+    usm_managed_mem =
         sycl::usm::alloc::shared ==
         sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
   }
-  if (managed_mem) {
+#else
+  constexpr bool usm_managed_mem{false};
+#endif
+  if (usm_managed_mem) {
     if (_N < (1 << 18)) {
       constexpr index_t localSize = 1024;
       const index_t number_WG = (_N + localSize - 1) / localSize;
diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp
index 779c33b43..21756c7c0 100644
--- a/src/operations/blas1/WGAtomicReduction.hpp
+++ b/src/operations/blas1/WGAtomicReduction.hpp
@@ -35,29 +35,34 @@ namespace blas {
  * and atomics operation to combine the results.
  *
  * */
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
-WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::WGAtomicReduction(
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
+WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::WGAtomicReduction(
     lhs_t& _l, rhs_t& _r)
     : lhs_(_l), rhs_(_r){};
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 PORTBLAS_INLINE
-    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::index_t
-    WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::get_size() const {
+    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::index_t
+    WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::get_size()
+        const {
   return rhs_.get_size();
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 PORTBLAS_INLINE bool
-WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::valid_thread(
+WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::valid_thread(
     cl::sycl::nd_item<1> ndItem) const {
   return true;
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 PORTBLAS_INLINE
-    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::value_t
-    WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::eval(
+    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::value_t
+    WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::eval(
         cl::sycl::nd_item<1> ndItem) {
   auto atomic_res =
       cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
@@ -103,10 +108,11 @@ PORTBLAS_INLINE
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
 template <typename sharedT>
 PORTBLAS_INLINE
-    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::value_t
-    WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::eval(
+    typename WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::value_t
+    WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::eval(
         sharedT scratch, cl::sycl::nd_item<1> ndItem) {
   const auto size = get_size();
   const int lid = static_cast<int>(ndItem.get_global_linear_id());
@@ -120,7 +126,7 @@ PORTBLAS_INLINE
                                       cl::sycl::plus<value_t>());
   }
   if (ndItem.get_local_id()[0] == 0) {
-    if constexpr (!managed_mem) {
+    if constexpr (!usmManagedMem) {
       auto atomic_res =
           cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
                                cl::sycl::memory_scope::device,
@@ -143,13 +149,15 @@ PORTBLAS_INLINE
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
-PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::bind(cl::sycl::handler& h) {
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
+PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::bind(cl::sycl::handler& h) {
   lhs_.bind(h);
   rhs_.bind(h);
 }
 
-template <typename operator_t, typename lhs_t, typename rhs_t, bool managed_mem>
-PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, managed_mem>::adjust_access_displacement() {
+template <typename operator_t, typename lhs_t, typename rhs_t,
+          bool usmManagedMem>
+PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t, usmManagedMem>::adjust_access_displacement() {
   lhs_.adjust_access_displacement();
   rhs_.adjust_access_displacement();
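
Illustration (editor's sketch, not part of the patch): the check that this
change guards behind SB_ENABLE_USM asks the SYCL runtime whether the output
pointer comes from a malloc_shared (managed) allocation and forwards the
answer as a compile-time flag, so the kernel can choose its atomic_ref address
space. The self-contained sketch below shows that idea using only standard
SYCL 2020 calls; run_reduction and its parameters are hypothetical
placeholders, not portBLAS APIs.

    #include <sycl/sycl.hpp>

    #include <cstddef>
    #include <iostream>

    // Hypothetical dispatch helper: the bool mirrors the usmManagedMem
    // template parameter introduced by this patch.
    template <bool UsmManagedMem>
    void run_reduction(sycl::queue &q, const float *in, float *out,
                       std::size_t n) {
      (void)q; (void)in; (void)out; (void)n;  // unused in this sketch
      // A real kernel would use generic-address-space atomics when
      // UsmManagedMem is true (correct for managed memory, slower) and
      // global-space atomics otherwise (the fast path kept for buffers and
      // device USM).
      std::cout << (UsmManagedMem ? "generic-space atomics path\n"
                                  : "global-space atomics path\n");
    }

    int main() {
      sycl::queue q;
      constexpr std::size_t n = 1 << 10;
      float *in = sycl::malloc_device<float>(n, q);
      float *rs = sycl::malloc_shared<float>(1, q);  // managed allocation

      // Same runtime test as the patched usm_managed_mem variable.
      bool usm_managed_mem = sycl::usm::alloc::shared ==
                             sycl::get_pointer_type(rs, q.get_context());

      if (usm_managed_mem) {
        run_reduction<true>(q, in, rs, n);
      } else {
        run_reduction<false>(q, in, rs, n);
      }

      sycl::free(in, q);
      sycl::free(rs, q);
      return 0;
    }

The #ifdef added by the patch relies on the fact that a buffer-only build
(SB_ENABLE_USM undefined) can never receive a malloc_shared pointer, so the
query above is dead code there and usm_managed_mem can be a constexpr false,
letting the faster global-space path be selected unconditionally.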