Add check for managed usm allocation for AMD
AMD's atomic operation implementation requires specific hardware to work
properly with the current reduction kernel. This patch adds a check, for AMD
only, so that the correct result is produced even when that specific hardware
is not available.

Signed-off-by: nscipione <[email protected]>
s-Nick committed Mar 21, 2024
1 parent eae3b23 commit a532beb
Showing 5 changed files with 94 additions and 48 deletions.
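
For context, the check introduced by this patch keys off the USM allocation kind of the result pointer. The standalone sketch below (not part of this commit; it assumes the SYCL 2020 USM API, and the names q, shared_ptr, device_ptr are illustrative only) shows how a shared, i.e. managed, allocation is distinguished from a device allocation at runtime:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue q;
  float* shared_ptr = sycl::malloc_shared<float>(1, q);
  float* device_ptr = sycl::malloc_device<float>(1, q);

  // sycl::get_pointer_type reports the USM allocation kind of a pointer.
  // Only malloc_shared allocations compare equal to sycl::usm::alloc::shared,
  // which is the condition the backend uses to pick the managed-memory path.
  std::cout << (sycl::get_pointer_type(shared_ptr, q.get_context()) ==
                sycl::usm::alloc::shared)
            << "\n";  // prints 1
  std::cout << (sycl::get_pointer_type(device_ptr, q.get_context()) ==
                sycl::usm::alloc::shared)
            << "\n";  // prints 0

  sycl::free(shared_ptr, q);
  sycl::free(device_ptr, q);
  return 0;
}

Buffer-based calls do not pass a raw pointer for the result, which is why the backend guards the query with if constexpr (std::is_pointer_v<decltype(_rs)>) in the diff below.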
6 changes: 3 additions & 3 deletions include/interface/blas1_interface.h
@@ -136,9 +136,9 @@ typename sb_handle_t::event_t _asum(
* \brief Prototype for the internal implementation of the ASUM operation. See
* documentation in the blas1_interface.hpp file for details.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem = false,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _asum_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
11 changes: 6 additions & 5 deletions include/operations/blas1_trees.h
@@ -208,7 +208,7 @@ struct AssignReduction {
* function below.
*
*/
template <typename operator_t, typename lhs_t, typename rhs_t>
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
struct WGAtomicReduction {
using value_t = typename lhs_t::value_t;
using index_t = typename rhs_t::index_t;
@@ -304,10 +304,11 @@ inline AssignReduction<operator_t, lhs_t, rhs_t> make_assign_reduction(
lhs_, rhs_, local_num_thread_, global_num_thread_);
}

template <typename operator_t, typename lhs_t, typename rhs_t>
inline WGAtomicReduction<operator_t, lhs_t, rhs_t> make_wg_atomic_reduction(
lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, lhs_t, rhs_t>(lhs_, rhs_);
template <typename operator_t, bool managed_mem = false, typename lhs_t,
typename rhs_t>
inline WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>
make_wg_atomic_reduction(lhs_t &lhs_, rhs_t &rhs_) {
return WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>(lhs_, rhs_);
}

template <bool is_max, bool is_step0, typename lhs_t, typename rhs_t>
47 changes: 38 additions & 9 deletions src/interface/blas1/backend/amd_gpu.hpp
@@ -34,16 +34,45 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _asum(
sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
/**
 * This check is absolutely necessary on AMD GPUs.
 * AMD atomic operations require a specific hardware combination that we can
 * neither detect nor enforce on users' systems. Since the reduction kernel
 * implementation relies on atomic operations, the operator may fail silently
 * when that combination is not present. This check selects a different atomic
 * address space, at the cost of a significant performance penalty, so that
 * the kernel also behaves correctly with managed memory (i.e. malloc_shared
 * allocations).
 **/
bool managed_mem{false};
if constexpr (std::is_pointer_v<decltype(_rs)>) {
managed_mem =
sycl::usm::alloc::shared ==
sycl::get_pointer_type(_rs, sb_handle.get_queue().get_context());
}
if (managed_mem) {
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, true>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
if (_N < (1 << 18)) {
constexpr index_t localSize = 1024;
const index_t number_WG = (_N + localSize - 1) / localSize;
return blas::internal::_asum_impl<static_cast<int>(localSize), 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
constexpr index_t number_WG = 256;
return blas::internal::_asum_impl<localSize, 32, false>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
}
}
}
} // namespace backend
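
As a rough distillation of the dispatch above (illustrative only, not the portBLAS API; AsumConfig and select_asum_config are hypothetical names): the launch geometry is unchanged with respect to the previous code, and the managed-memory result is only forwarded as an extra template argument.

#include <cstdint>

struct AsumConfig {
  int local_size;        // work-group size
  std::int64_t num_wg;   // number of work-groups
  bool managed_mem;      // forwarded as the usmManagedMem template argument
};

// Small inputs get one 1024-wide work-group per 1024 elements; large inputs
// use a fixed 256 x 512 launch. The boolean only selects the kernel variant.
inline AsumConfig select_asum_config(std::int64_t n, bool managed_mem) {
  if (n < (1 << 18)) {
    constexpr int local_size = 1024;
    return {local_size, (n + local_size - 1) / local_size, managed_mem};
  }
  return {512, 256, managed_mem};
}

Passing the flag as a template argument rather than a runtime value lets the kernel pick the atomic address space at compile time, so the fast global-space path pays no extra cost.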
9 changes: 5 additions & 4 deletions src/interface/blas1_interface.hpp
@@ -226,9 +226,9 @@ typename sb_handle_t::event_t _asum(
 * implementation uses a kernel that doesn't
* require local memory.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
template <int localSize, int localMemSize, bool usmManagedMem,
typename sb_handle_t, typename container_0_t, typename container_1_t,
typename index_t, typename increment_t>
typename sb_handle_t::event_t _asum_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
@@ -238,7 +238,8 @@ typename sb_handle_t::event_t _asum_impl(
auto rs = make_vector_view(_rs, static_cast<increment_t>(1),
static_cast<index_t>(1));
typename sb_handle_t::event_t ret;
auto asumOp = make_wg_atomic_reduction<AbsoluteAddOperator>(rs, vx);
auto asumOp =
make_wg_atomic_reduction<AbsoluteAddOperator, usmManagedMem>(rs, vx);
if constexpr (localMemSize != 0) {
ret = sb_handle.execute(asumOp, static_cast<index_t>(localSize),
static_cast<index_t>(number_WG * localSize),
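
The localMemSize documentation above refers to the compile-time dispatch at the end of this hunk. The following simplified, hypothetical launcher (launch_fill is not the sb_handle API; it uses SYCL 2020 spellings rather than the cl::sycl namespace used in the repository) shows the same pattern of selecting between a local-memory and a local-memory-free launch:

#include <sycl/sycl.hpp>

// When LocalMemSize is zero, no local accessor is created and a
// local-memory-free kernel is launched instead.
template <int LocalMemSize>
void launch_fill(sycl::queue& q, float* out, std::size_t n) {
  q.submit([&](sycl::handler& cgh) {
    sycl::nd_range<1> range{sycl::range<1>(n), sycl::range<1>(64)};
    if constexpr (LocalMemSize != 0) {
      sycl::local_accessor<float, 1> scratch(sycl::range<1>(LocalMemSize), cgh);
      cgh.parallel_for(range, [=](sycl::nd_item<1> it) {
        scratch[it.get_local_id(0) % LocalMemSize] = 1.0f;  // touch local memory
        out[it.get_global_id(0)] = 1.0f;
      });
    } else {
      cgh.parallel_for(range, [=](sycl::nd_item<1> it) {
        out[it.get_global_id(0)] = 1.0f;
      });
    }
  }).wait();
}

int main() {
  sycl::queue q;
  float* out = sycl::malloc_device<float>(256, q);
  launch_fill<32>(q, out, 256);  // variant with work-group local memory
  launch_fill<0>(q, out, 256);   // local-memory-free variant
  sycl::free(out, q);
}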
69 changes: 42 additions & 27 deletions src/operations/blas1/WGAtomicReduction.hpp
@@ -35,26 +35,30 @@ namespace blas {
 * and atomic operations to combine the results.
*
* */
template <typename operator_t, typename lhs_t, typename rhs_t>
WGAtomicReduction<operator_t, lhs_t, rhs_t>::WGAtomicReduction(lhs_t& _l,
rhs_t& _r)
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::WGAtomicReduction(
lhs_t& _l, rhs_t& _r)
: lhs_(_l), rhs_(_r){};

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE typename WGAtomicReduction<operator_t, lhs_t, rhs_t>::index_t
WGAtomicReduction<operator_t, lhs_t, rhs_t>::get_size() const {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE
typename WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::index_t
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::get_size() const {
return rhs_.get_size();
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE bool WGAtomicReduction<operator_t, lhs_t, rhs_t>::valid_thread(
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE bool
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::valid_thread(
cl::sycl::nd_item<1> ndItem) const {
return true;
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE typename WGAtomicReduction<operator_t, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(cl::sycl::nd_item<1> ndItem) {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE
typename WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::eval(
cl::sycl::nd_item<1> ndItem) {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
Expand All @@ -80,16 +84,13 @@ WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(cl::sycl::nd_item<1> ndItem) {
}
return {};
}
template <typename operator_t, typename lhs_t, typename rhs_t>

template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
template <typename sharedT>
PORTBLAS_INLINE typename WGAtomicReduction<operator_t, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(sharedT scratch,
cl::sycl::nd_item<1> ndItem) {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
cl::sycl::access::address_space::global_space>(
lhs_.get_data()[0]);
PORTBLAS_INLINE
typename WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::value_t
WGAtomicReduction<operator_t, managed_mem, lhs_t, rhs_t>::eval(
sharedT scratch, cl::sycl::nd_item<1> ndItem) {
const auto size = get_size();
const int lid = static_cast<int>(ndItem.get_global_linear_id());
const auto loop_stride =
@@ -119,22 +120,36 @@ WGAtomicReduction<operator_t, lhs_t, rhs_t>::eval(sharedT scratch,
cl::sycl::plus<value_t>());
}
if (ndItem.get_local_id()[0] == 0) {
atomic_res += val;
if constexpr (!managed_mem) {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
cl::sycl::access::address_space::global_space>(
lhs_.get_data()[0]);
atomic_res += val;
} else {
auto atomic_res =
cl::sycl::atomic_ref<value_t, cl::sycl::memory_order::relaxed,
cl::sycl::memory_scope::device,
cl::sycl::access::address_space::generic_space>(
lhs_.get_data()[0]);
atomic_res += val;
}
}

return {};
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void WGAtomicReduction<operator_t, lhs_t, rhs_t>::bind(
cl::sycl::handler& h) {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void WGAtomicReduction<operator_t, managed_mem, lhs_t,
rhs_t>::bind(cl::sycl::handler& h) {
lhs_.bind(h);
rhs_.bind(h);
}

template <typename operator_t, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void
WGAtomicReduction<operator_t, lhs_t, rhs_t>::adjust_access_displacement() {
template <typename operator_t, bool managed_mem, typename lhs_t, typename rhs_t>
PORTBLAS_INLINE void WGAtomicReduction<operator_t, managed_mem, lhs_t,
rhs_t>::adjust_access_displacement() {
lhs_.adjust_access_displacement();
rhs_.adjust_access_displacement();
}
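
To isolate what the managed-memory variant of eval changes: only the address space of the final atomic accumulation differs. A standalone sketch of the two atomic_ref flavours follows (illustrative, not part of this commit; atomic_accumulate is a hypothetical name and SYCL 2020 spellings are used rather than the repository's cl::sycl namespace):

#include <sycl/sycl.hpp>
#include <cstddef>
#include <iostream>

template <bool ManagedMem>
void atomic_accumulate(sycl::queue& q, float* result, const float* in,
                       std::size_t n) {
  q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
    if constexpr (ManagedMem) {
      // Generic address space: safe for malloc_shared (managed) results.
      sycl::atomic_ref<float, sycl::memory_order::relaxed,
                       sycl::memory_scope::device,
                       sycl::access::address_space::generic_space>
          ref(*result);
      ref += in[i];
    } else {
      // Global address space: faster when the result is in device memory.
      sycl::atomic_ref<float, sycl::memory_order::relaxed,
                       sycl::memory_scope::device,
                       sycl::access::address_space::global_space>
          ref(*result);
      ref += in[i];
    }
  }).wait();
}

int main() {
  sycl::queue q;
  const std::size_t n = 1024;
  float* in = sycl::malloc_shared<float>(n, q);
  float* result = sycl::malloc_shared<float>(1, q);
  for (std::size_t i = 0; i < n; ++i) in[i] = 1.0f;
  *result = 0.0f;
  atomic_accumulate<true>(q, result, in, n);  // managed-memory (generic) path
  std::cout << *result << "\n";               // expected: 1024
  sycl::free(in, q);
  sycl::free(result, q);
}

On the affected AMD configurations only the generic_space form is reliable for malloc_shared results; global_space remains the faster choice when the result lives in ordinary device memory, which is why the kernel keeps both paths.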
