From df00f2d10c05337e26277824dea9b80815418d5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Komar?= <69756491+Xewar313@users.noreply.github.com> Date: Tue, 17 Sep 2024 12:10:49 +0200 Subject: [PATCH 1/3] Fix warning and compilation errors (#1849) --- include/oneapi/dpl/pstl/execution_impl.h | 8 ++++---- .../dpcpp/parallel_backend_sycl_radix_sort_one_wg.h | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/execution_impl.h b/include/oneapi/dpl/pstl/execution_impl.h index 79ed46570bf..c6f32f17495 100644 --- a/include/oneapi/dpl/pstl/execution_impl.h +++ b/include/oneapi/dpl/pstl/execution_impl.h @@ -106,25 +106,25 @@ __select_backend(oneapi::dpl::execution::parallel_unsequenced_policy, _IteratorT namespace __ranges { -::oneapi::dpl::__internal::__serial_tag +inline ::oneapi::dpl::__internal::__serial_tag __select_backend(oneapi::dpl::execution::sequenced_policy) { return {}; } -::oneapi::dpl::__internal::__serial_tag //vectorization allowed +inline ::oneapi::dpl::__internal::__serial_tag //vectorization allowed __select_backend(oneapi::dpl::execution::unsequenced_policy) { return {}; } -::oneapi::dpl::__internal::__parallel_tag +inline ::oneapi::dpl::__internal::__parallel_tag __select_backend(oneapi::dpl::execution::parallel_policy) { return {}; } -::oneapi::dpl::__internal::__parallel_tag //vectorization allowed +inline ::oneapi::dpl::__internal::__parallel_tag //vectorization allowed __select_backend(oneapi::dpl::execution::parallel_unsequenced_policy) { return {}; diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index fbf80582d43..7f133e54da1 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -164,9 +164,12 @@ struct __subgroup_radix_sort auto __counter_lacc = __buf_count.get_acc(__cgh); __cgh.parallel_for<_Name...>( - __range, - ([=](sycl::nd_item<1> __it)[[_ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED(__req_sub_group_size)]] { - union __storage { _ValT __v[__block_size]; __storage(){} } __values; + __range, ([=](sycl::nd_item<1> __it) [[_ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED(16)]] { + union __storage + { + _ValT __v[__block_size]; + __storage() {} + } __values; uint16_t __wi = __it.get_local_linear_id(); uint16_t __begin_bit = 0; constexpr uint16_t __end_bit = sizeof(_KeyT) * ::std::numeric_limits::digits; From 4c1baaaa68f8d5a74e649dda5a366531d79ff15e Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Fri, 13 Sep 2024 10:44:31 -0500 Subject: [PATCH 2/3] Always use size 16 sub-groups in single work-group radix sort if supported (#1833) * Only use sub-group sizes of 16 in one-wg radix sort This change is added to avoid a bug where IGC cannot compile SIMD32 kernels with -O0 compilation flags. No performance impact is observed. Signed-off-by: Matthew Michel --------- Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_radix_sort.h | 10 +++++----- .../dpcpp/parallel_backend_sycl_radix_sort_one_wg.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h index 8c61bb9de0a..a220b3c29ff 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h @@ -816,11 +816,11 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP else if (__n <= 4096 && __wg_size * 4 <= __max_wg_size) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); - // In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED - // based upon the iters per item. For the below cases, register spills that result in runtime exceptions have - // been observed on accelerators that do not support the requested sub-group size of 16. For the above cases - // that request but may not receive a sub-group size of 16, inputs are small enough to avoid register - // spills on assessed hardware. + // In __subgroup_radix_sort, we request a sub-group size of 16 via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED + // for compilation targets that support this option. For the below cases, register spills that result in + // runtime exceptions have been observed on accelerators that do not support the requested sub-group size of 16. + // For the above cases that request but may not receive a sub-group size of 16, inputs are small enough to avoid + // register spills on assessed hardware. else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16) __event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}( __exec.queue(), ::std::forward<_Range>(__in_rng), __proj); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index 7f133e54da1..d6534ad1afd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -30,8 +30,7 @@ template class __radix_sort_one_wg_kernel; template + std::uint32_t __radix = 4, bool __is_asc = true> struct __subgroup_radix_sort { template @@ -147,6 +146,7 @@ struct __subgroup_radix_sort auto operator()(sycl::queue __q, _RangeIn&& __src, _Proj __proj, _SLM_tag_val, _SLM_counter) { + constexpr std::uint16_t __req_sub_group_size = 16; uint16_t __n = __src.size(); assert(__n <= __block_size * __wg_size); From 8af77f2531297f16baf54afff5ad46529017f81f Mon Sep 17 00:00:00 2001 From: timmiesmith Date: Thu, 21 Nov 2024 13:12:12 -0600 Subject: [PATCH 3/3] Remove unused variable. --- .../pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h index d6534ad1afd..6dd3b193a08 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort_one_wg.h @@ -146,7 +146,6 @@ struct __subgroup_radix_sort auto operator()(sycl::queue __q, _RangeIn&& __src, _Proj __proj, _SLM_tag_val, _SLM_counter) { - constexpr std::uint16_t __req_sub_group_size = 16; uint16_t __n = __src.size(); assert(__n <= __block_size * __wg_size);