diff --git a/include/interface/blas1_interface.h b/include/interface/blas1_interface.h
index 7684a0a65..80d104e01 100644
--- a/include/interface/blas1_interface.h
+++ b/include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
  * \brief Prototype for the internal implementation of the ASUM operation. See
  * documentation in the blas1_interface.hpp file for details.
  */
-template <int localSize, int localMemSize, typename sb_handle_t,
-          typename container_0_t, typename container_1_t, typename index_t,
-          typename increment_t>
+template <int localSize, int localMemSize, bool managed_mem = false,
+          typename sb_handle_t, typename container_0_t,
+          typename container_1_t, typename index_t, typename increment_t>
 typename sb_handle_t::event_t _asum_impl(
     sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const index_t number_WG,
diff --git a/include/operations/blas1_trees.h b/include/operations/blas1_trees.h
index f0d94d596..27a34b5ec 100644
--- a/include/operations/blas1_trees.h
+++ b/include/operations/blas1_trees.h
@@ -208,7 +208,7 @@ struct AssignReduction {
  * function below.
  *
  */
-template <typename lhs_t, typename rhs_t>
+template <bool managed_mem, typename lhs_t, typename rhs_t>
 struct WGAtomicReduction {
   using value_t = typename lhs_t::value_t;
   using index_t = typename rhs_t::index_t;
@@ -304,10 +304,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
       lhs_, rhs_, local_num_thread_, global_num_thread_);
 }
 
-template <typename lhs_t, typename rhs_t>
-inline WGAtomicReduction<lhs_t, rhs_t> make_wg_atomic_reduction(
-    lhs_t &lhs_, rhs_t &rhs_) {
-  return WGAtomicReduction<lhs_t, rhs_t>(lhs_, rhs_);
+template <bool managed_mem = false, typename lhs_t, typename rhs_t>
+inline WGAtomicReduction<managed_mem, lhs_t, rhs_t>
+make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
+  return WGAtomicReduction<managed_mem, lhs_t, rhs_t>(lhs_, rhs_);
 }
 
diff --git a/src/interface/blas1/backend/amd_gpu.hpp b/src/interface/blas1/backend/amd_gpu.hpp
index 999b596df..741e7f730 100644
--- a/src/interface/blas1/backend/amd_gpu.hpp
+++ b/src/interface/blas1/backend/amd_gpu.hpp
@@ -34,16 +34,45 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
 typename sb_handle_t::event_t _asum(
     sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const typename sb_handle_t::event_t &_dependencies) {
-  if (_N < (1 << 18)) {
-    constexpr index_t localSize = 1024;
-    const index_t number_WG = (_N + localSize - 1) / localSize;
-    return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+  /**
+   * This check is absolutely necessary for AMD GPUs. AMD atomic operations
+   * require a specific hardware combination that we can neither check nor
+   * enforce on users. Since the reduction operator's kernel implementation
+   * uses atomic operations, the operator may fail silently without that
+   * particular combination. This check enforces a different atomic address
+   * space, causing a big performance degradation but making the kernel
+   * behave correctly with managed memory (aka malloc_shared allocations).
+   **/
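
For reference, the backend change above keys off the USM allocation kind of
the result pointer. The following is a minimal standalone sketch, not part of
the patch, assuming a SYCL 2020 implementation (queue and pointer names are
illustrative): sycl::get_pointer_type() is what distinguishes a malloc_shared
(managed) allocation from a malloc_device one.

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  float *dev = sycl::malloc_device<float>(4, q);
  float *shr = sycl::malloc_shared<float>(4, q);
  // get_pointer_type() classifies a USM pointer within a context.
  std::cout << (sycl::get_pointer_type(dev, q.get_context()) ==
                sycl::usm::alloc::device)
            << "\n";  // 1: device allocation
  std::cout << (sycl::get_pointer_type(shr, q.get_context()) ==
                sycl::usm::alloc::shared)
            << "\n";  // 1: shared (managed) allocation
  sycl::free(dev, q);
  sycl::free(shr, q);
  return 0;
}

The std::is_pointer_v guard in the patch presumably exists because the same
interface is also instantiated with buffer-based container types, for which
the USM pointer query does not apply.
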
+  bool managed_mem{false};
+  if constexpr (std::is_pointer_v<container_1_t>) {
+    managed_mem =
+        sycl::usm::alloc::shared ==
+        sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
+  }
+  if (managed_mem) {
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_asum_impl<static_cast<int>(localSize), 32, true>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 256;
+      return blas::internal::_asum_impl<localSize, 0, true>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    }
   } else {
-    constexpr int localSize = 512;
-    constexpr index_t number_WG = 256;
-    return blas::internal::_asum_impl<localSize, 0>(
-        sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    if (_N < (1 << 18)) {
+      constexpr index_t localSize = 1024;
+      const index_t number_WG = (_N + localSize - 1) / localSize;
+      return blas::internal::_asum_impl<static_cast<int>(localSize), 32,
+                                        false>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    } else {
+      constexpr int localSize = 512;
+      constexpr index_t number_WG = 256;
+      return blas::internal::_asum_impl<localSize, 0, false>(
+          sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
+    }
   }
 }
 }  // namespace backend
diff --git a/src/interface/blas1_interface.hpp b/src/interface/blas1_interface.hpp
index df914b71b..7f6ee962e 100644
--- a/src/interface/blas1_interface.hpp
+++ b/src/interface/blas1_interface.hpp
@@ -226,9 +226,9 @@ typename sb_handle_t::event_t _asum(
  * implementation use a kernel implementation which doesn't
  * require local memory.
  */
-template <int localSize, int localMemSize, typename sb_handle_t,
-          typename container_0_t, typename container_1_t, typename index_t,
-          typename increment_t>
+template <int localSize, int localMemSize, bool managed_mem,
+          typename sb_handle_t, typename container_0_t,
+          typename container_1_t, typename index_t, typename increment_t>
 typename sb_handle_t::event_t _asum_impl(
     sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
     container_1_t _rs, const index_t number_WG,
@@ -238,7 +238,8 @@ typename sb_handle_t::event_t _asum_impl(
   auto rs = make_vector_view(_rs, static_cast<increment_t>(1),
                              static_cast<index_t>(1));
   typename sb_handle_t::event_t ret;
-  auto asumOp = make_wg_atomic_reduction(rs, vx);
+  auto asumOp =
+      make_wg_atomic_reduction<managed_mem>(rs, vx);
   if constexpr (localMemSize != 0) {
     ret = sb_handle.execute(asumOp, static_cast<index_t>(localSize),
                             static_cast<index_t>(number_WG * localSize),
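
The interface change above threads the managed-memory decision through as a
compile-time bool, so each kernel is specialized rather than branching at
runtime. A minimal sketch of this runtime-to-compile-time dispatch pattern
(run_reduction is a hypothetical stand-in for _asum_impl, not portBLAS code):

#include <iostream>

template <bool managed_mem>
void run_reduction() {
  // if constexpr discards the untaken branch at compile time, so each
  // instantiation contains exactly one atomic flavour.
  if constexpr (managed_mem) {
    std::cout << "generic address-space atomics\n";
  } else {
    std::cout << "global address-space atomics\n";
  }
}

void dispatch(bool managed_mem) {
  // One runtime branch, two template instantiations: this mirrors the
  // if (managed_mem) { ...true... } else { ...false... } in amd_gpu.hpp.
  managed_mem ? run_reduction<true>() : run_reduction<false>();
}

int main() {
  dispatch(true);
  dispatch(false);
  return 0;
}
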
diff --git a/src/operations/blas1/WGAtomicReduction.hpp b/src/operations/blas1/WGAtomicReduction.hpp
index 22d923e8d..779c33b43 100644
--- a/src/operations/blas1/WGAtomicReduction.hpp
+++ b/src/operations/blas1/WGAtomicReduction.hpp
@@ -35,26 +35,30 @@ namespace blas {
  * and atomics operation to combine the results.
  *
  * */
-template <typename lhs_t, typename rhs_t>
-WGAtomicReduction<lhs_t, rhs_t>::WGAtomicReduction(lhs_t& _l,
-                                                   rhs_t& _r)
+template <bool managed_mem, typename lhs_t, typename rhs_t>
+WGAtomicReduction<managed_mem, lhs_t, rhs_t>::WGAtomicReduction(
+    lhs_t& _l, rhs_t& _r)
     : lhs_(_l), rhs_(_r){};
-template <typename lhs_t, typename rhs_t>
-PORTBLAS_INLINE typename WGAtomicReduction<lhs_t, rhs_t>::index_t
-WGAtomicReduction<lhs_t, rhs_t>::get_size() const {
+template <bool managed_mem, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE
+    typename WGAtomicReduction<managed_mem, lhs_t, rhs_t>::index_t
+    WGAtomicReduction<managed_mem, lhs_t, rhs_t>::get_size() const {
   return rhs_.get_size();
 }
-template <typename lhs_t, typename rhs_t>
-PORTBLAS_INLINE bool WGAtomicReduction<lhs_t, rhs_t>::valid_thread(
+template <bool managed_mem, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE bool
+WGAtomicReduction<managed_mem, lhs_t, rhs_t>::valid_thread(
     cl::sycl::nd_item<1> ndItem) const {
   return true;
 }
-template <typename lhs_t, typename rhs_t>
-PORTBLAS_INLINE typename WGAtomicReduction<lhs_t, rhs_t>::value_t
-WGAtomicReduction<lhs_t, rhs_t>::eval(cl::sycl::nd_item<1> ndItem) {
+template <bool managed_mem, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE
+    typename WGAtomicReduction<managed_mem, lhs_t, rhs_t>::value_t
+    WGAtomicReduction<managed_mem, lhs_t, rhs_t>::eval(
+        cl::sycl::nd_item<1> ndItem) {
   auto atomic_res = cl::sycl::atomic_ref<
       value_t, cl::sycl::memory_order::relaxed, cl::sycl::memory_scope::device,
       cl::sycl::access::address_space::global_space>(lhs_.get_data()[0]);
   const auto size = get_size();
   const int lid = static_cast<int>(ndItem.get_global_linear_id());
   const auto loop_stride =
@@ -72,16 +76,13 @@ WGAtomicReduction<lhs_t, rhs_t>::eval(cl::sycl::nd_item<1> ndItem) {
   }
   return {};
 }
-template <typename lhs_t, typename rhs_t>
+
+template <bool managed_mem, typename lhs_t, typename rhs_t>
 template <typename sharedT>
-PORTBLAS_INLINE typename WGAtomicReduction<lhs_t, rhs_t>::value_t
-WGAtomicReduction<lhs_t, rhs_t>::eval(sharedT scratch,
-                                      cl::sycl::nd_item<1> ndItem) {
-  auto atomic_res =
-      cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
-                           cl::sycl::memory_scope::device,
-                           cl::sycl::access::address_space::global_space>(
-          lhs_.get_data()[0]);
+PORTBLAS_INLINE
+    typename WGAtomicReduction<managed_mem, lhs_t, rhs_t>::value_t
+    WGAtomicReduction<managed_mem, lhs_t, rhs_t>::eval(
+        sharedT scratch, cl::sycl::nd_item<1> ndItem) {
   const auto size = get_size();
   const int lid = static_cast<int>(ndItem.get_global_linear_id());
   const auto loop_stride =
@@ -119,22 +120,36 @@ WGAtomicReduction<lhs_t, rhs_t>::eval(sharedT scratch,
                               cl::sycl::plus<value_t>());
   }
   if (ndItem.get_local_id()[0] == 0) {
-    atomic_res += val;
+    if constexpr (!managed_mem) {
+      auto atomic_res =
+          cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
+                               cl::sycl::memory_scope::device,
+                               cl::sycl::access::address_space::global_space>(
+              lhs_.get_data()[0]);
+      atomic_res += val;
+    } else {
+      auto atomic_res =
+          cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
+                               cl::sycl::memory_scope::device,
+                               cl::sycl::access::address_space::generic_space>(
+              lhs_.get_data()[0]);
+      atomic_res += val;
+    }
   }
   return {};
 }
-template <typename lhs_t, typename rhs_t>
-PORTBLAS_INLINE void WGAtomicReduction<lhs_t, rhs_t>::bind(
-    cl::sycl::handler& h) {
+template <bool managed_mem, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE void WGAtomicReduction<managed_mem, lhs_t,
+                                       rhs_t>::bind(cl::sycl::handler& h) {
   lhs_.bind(h);
   rhs_.bind(h);
 }
-template <typename lhs_t, typename rhs_t>
-PORTBLAS_INLINE void
-WGAtomicReduction<lhs_t, rhs_t>::adjust_access_displacement() {
+template <bool managed_mem, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE void WGAtomicReduction<managed_mem, lhs_t,
+                                       rhs_t>::adjust_access_displacement() {
   lhs_.adjust_access_displacement();
   rhs_.adjust_access_displacement();
 }
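
The core of the fix is the address space handed to the atomic_ref. Below is a
self-contained sketch of the distinction the kernel change relies on (not
portBLAS code; SYCL 2020 assumed, names illustrative):

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  // Managed (shared) allocation: the case that could fail silently on AMD
  // when the atomic was declared with global_space.
  float *res = sycl::malloc_shared<float>(1, q);
  *res = 0.0f;
  q.parallel_for(sycl::range<1>{64}, [=](sycl::id<1>) {
     // generic_space works for any USM pointer, at a performance cost;
     // global_space is the faster choice when the result is known to live
     // in device-resident global memory.
     sycl::atomic_ref<float, sycl::memory_order::relaxed,
                      sycl::memory_scope::device,
                      sycl::access::address_space::generic_space>
         ref(*res);
     ref += 1.0f;  // one relaxed atomic add per work-item
   }).wait();
  std::cout << *res << std::endl;  // 64
  sycl::free(res, q);
  return 0;
}

Swapping generic_space for global_space here reproduces the faster variant
that the patched kernel keeps for non-managed allocations.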