From a8b521d5be36445bd8e0728debf40b247a84b55e Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 29 Jan 2019 12:02:15 +0200 Subject: [PATCH] Handle 3D warp ids --- trove/aos.h | 18 +++++++-------- trove/transpose.h | 58 ++++++++++++++++++++++++----------------------- trove/warp.h | 4 ++++ 3 files changed, 43 insertions(+), 37 deletions(-) diff --git a/trove/aos.h b/trove/aos.h index 4bb1fa2..12fe515 100644 --- a/trove/aos.h +++ b/trove/aos.h @@ -65,7 +65,7 @@ struct use_direct { template __device__ typename enable_if::value, T>::type load_warp_contiguous(const T* src) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; const T* warp_begin_src = src - warp_id; typedef typename detail::dismember_type::type U; const U* as_int_src = (const U*)warp_begin_src; @@ -85,7 +85,7 @@ load_warp_contiguous(const T* src) { template __device__ typename enable_if::value>::type store_warp_contiguous(const T& data, T* dest) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; T* warp_begin_dest = dest - warp_id; typedef typename detail::dismember_type::type U; U* as_int_dest = (U*)warp_begin_dest; @@ -134,7 +134,7 @@ __device__ void update_indices(int& div, int& mod) { } div += address_constants::div_offset; } - + template struct indexed_load { @@ -145,8 +145,8 @@ struct indexed_load { U* address = compute_address(src, div, mod); result = *address; update_indices(div, mod); - - + + return array( result, indexed_load::impl(src, div, mod)); @@ -207,7 +207,7 @@ bool is_contiguous(int warp_id, const T* ptr) { template __device__ typename enable_if::value, T>::type load_dispatch(const T* src) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; // if (detail::is_contiguous(warp_id, src)) { // return detail::load_warp_contiguous(src); // } else { @@ -220,7 +220,7 @@ load_dispatch(const T* src) { warp_id % address_constants::m); r2c_warp_transpose(loaded); return detail::fuse(loaded); - // } + // } } @@ -235,7 +235,7 @@ load_dispatch(const T* src) { template __device__ typename enable_if::value>::type store_dispatch(const T& data, T* dest) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; // if (detail::is_contiguous(warp_id, dest)) { // detail::store_warp_contiguous(data, dest); // } else { @@ -256,7 +256,7 @@ store_dispatch(const T& data, T* dest) { detail::divergent_store(data, dest); } - + } template diff --git a/trove/transpose.h b/trove/transpose.h index e9843d1..a527c93 100644 --- a/trove/transpose.h +++ b/trove/transpose.h @@ -67,7 +67,7 @@ struct c2r_offset_constants { template struct c2r_offset_constants { - static const int offset = WARP_SIZE - WARP_SIZE/m; + static const int offset = WARP_SIZE - WARP_SIZE/m; static const int permute = m - 1; }; @@ -219,7 +219,8 @@ template struct c2r_compute_initial_offset { typedef c2r_offset_constants constants; __device__ static int impl() { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; + int initial_offset = ((WARP_SIZE - warp_id) * constants::offset) & WARP_MASK; return initial_offset; @@ -229,7 +230,7 @@ struct c2r_compute_initial_offset { template struct c2r_compute_initial_offset { __device__ static int impl() { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int initial_offset = ((warp_id * (WARP_SIZE + 1)) >> static_log::value) & WARP_MASK; @@ -243,7 +244,8 @@ struct r2c_compute_initial_offset {}; template struct r2c_compute_initial_offset { __device__ static int impl() { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; + int initial_offset = (warp_id * m) & WARP_MASK; return initial_offset; } @@ -262,7 +264,7 @@ array c2r_compute_offsets() { template struct c2r_compute_composite_offsets{}; - + template struct c2r_compute_composite_offsets, m, p> { static const int n = WARP_SIZE; @@ -284,7 +286,7 @@ struct c2r_compute_composite_offsets, m, p> { result_type(offset, c2r_compute_composite_offsets, m, p+1> ::impl(new_idx, col)); - + } }; @@ -304,11 +306,11 @@ struct c2r_compute_composite_offsets, m, p> { int offset = ((((idx >> log_c) * k) & mod_n_div_c) + ((idx & mod_c) << log_n_div_c)) & mod_n; return result_type(offset); - + } }; - + template struct r2c_offsets { static const int value = (offset * index) % bound; @@ -365,7 +367,7 @@ struct r2c_compute_offsets_impl, index, m, power_of_two> { template struct r2c_compute_composite_offsets{}; - + template struct r2c_compute_composite_offsets, m> { static const int n = WARP_SIZE; @@ -381,7 +383,7 @@ struct r2c_compute_composite_offsets, m> { result_type(offset & mod_n, r2c_compute_composite_offsets, m> ::impl(col, new_offset, lb, ub)); - + } }; @@ -395,7 +397,7 @@ struct r2c_compute_composite_offsets, m> { typedef array result_type; __host__ __device__ static result_type impl(int col, int offset, int lb, int ub) { return result_type(offset & mod_n); - + } }; @@ -408,8 +410,8 @@ array r2c_compute_offsets() { return r2c_compute_offsets_impl::impl(initial_offset); } - - + + template struct warp_shuffle {}; @@ -447,7 +449,7 @@ template struct c2r_compute_indices_impl { __device__ static void impl(Array& indices, int& rotation) { indices = detail::c2r_compute_offsets(); - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int size = Array::size; int r = detail::c2r_offset_constants::rotate; rotation = (warp_id * r) % size; @@ -458,7 +460,7 @@ template struct c2r_compute_indices_impl { __device__ static void impl(Array& indices, int& rotation) { indices = detail::c2r_compute_offsets(); - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int size = Array::size; rotation = (size - warp_id) & (size - 1); } @@ -467,8 +469,8 @@ struct c2r_compute_indices_impl { template struct c2r_compute_indices_impl { __device__ static void impl(Array& indices, int& rotation) { - int warp_id = threadIdx.x & WARP_MASK; - + int warp_id = thread_id() & WARP_MASK; + indices = detail::c2r_compute_composite_offsets:: impl(warp_id, warp_id); rotation = warp_id % Array::size; @@ -493,11 +495,11 @@ struct c2r_warp_transpose_impl { __device__ static void impl(Array& src, const Indices& indices, const int& rotation) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int pre_rotation = warp_id >> (LOG_WARP_SIZE - static_log::value); - src = rotate(src, pre_rotation); + src = rotate(src, pre_rotation); c2r_warp_transpose_impl::impl (src, indices, rotation); } @@ -508,7 +510,7 @@ struct c2r_warp_transpose_impl { __device__ static void impl(Array& src, const Indices& indices, const int& rotation) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int pre_rotation = warp_id >> static_log::value>::value; src = rotate(src, pre_rotation); detail::warp_shuffle::impl(src, indices); @@ -525,7 +527,7 @@ struct r2c_compute_indices_impl { __device__ static void impl(Array& indices, int& rotation) { indices = detail::r2c_compute_offsets(); - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int size = Array::size; int r = size - detail::r2c_offset_constants::permute; @@ -544,7 +546,7 @@ struct r2c_compute_indices_impl { static const int n_div_m = WARP_SIZE / m; static const int log_n_div_m = static_log::value; __device__ static void impl(Array& indices, int& rotation) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; int size = Array::size; rotation = warp_id % size; int initial_offset = ((warp_id << log_m) + (warp_id >> log_n_div_m)) & mod_n; @@ -559,13 +561,13 @@ struct r2c_compute_indices_impl { static const int size = Array::size; static const int c = static_gcd::value; __device__ static void impl(Array& indices, int& rotation) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; rotation = size - (warp_id % size); int lb = (size * warp_id) & WARP_MASK; int ub = lb + size; int offset = lb + warp_id / (WARP_SIZE/c); indices = detail::r2c_compute_composite_offsets:: - impl(warp_id, offset, lb, ub); + impl(warp_id, offset, lb, ub); } }; @@ -591,7 +593,7 @@ struct r2c_warp_transpose_impl { Array rotated = rotate(src, rotation); detail::warp_shuffle::impl(rotated, indices); const int size = Array::size; - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; src = rotate(detail::r2c_tx_permute(rotated), (size-warp_id/(WARP_SIZE/size))%size); } @@ -604,7 +606,7 @@ struct r2c_warp_transpose_impl { __device__ static void impl(Array& src, const Indices& indices, const int& rotation) { - int warp_id = threadIdx.x & WARP_MASK; + int warp_id = thread_id() & WARP_MASK; src = composite_r2c_tx_permute(src); src = rotate(src, rotation); detail::warp_shuffle::impl(src, indices); @@ -621,7 +623,7 @@ __device__ void c2r_compute_indices(array& indices, int& rotation) { Array, typename detail::tx_algorithm::type> ::impl(indices, rotation); - + } template @@ -676,7 +678,7 @@ __device__ void r2c_warp_transpose(array& src) { indices_array indices; int rotation; r2c_compute_indices(indices, rotation); - + detail::r2c_warp_transpose_impl< Array, array, typename detail::tx_algorithm::type> diff --git a/trove/warp.h b/trove/warp.h index 1f31d57..ff7bd3d 100644 --- a/trove/warp.h +++ b/trove/warp.h @@ -36,6 +36,10 @@ enum { LOG_WARP_SIZE = 5 }; +inline __device__ int thread_id() { + return (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x; +} + __device__ inline bool warp_converged() { #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000