Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle warp configurations for 2D and 3D blocks #12

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions trove/aos.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ struct use_direct {
template<typename T>
__device__ typename enable_if<detail::use_shfl<T>::value, T>::type
load_warp_contiguous(const T* src) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
const T* warp_begin_src = src - warp_id;
typedef typename detail::dismember_type<T>::type U;
const U* as_int_src = (const U*)warp_begin_src;
Expand All @@ -85,7 +85,7 @@ load_warp_contiguous(const T* src) {
template<typename T>
__device__ typename enable_if<detail::use_shfl<T>::value>::type
store_warp_contiguous(const T& data, T* dest) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
T* warp_begin_dest = dest - warp_id;
typedef typename detail::dismember_type<T>::type U;
U* as_int_dest = (U*)warp_begin_dest;
Expand Down Expand Up @@ -134,7 +134,7 @@ __device__ void update_indices(int& div, int& mod) {
}
div += address_constants<T>::div_offset;
}


template<int s, typename T>
struct indexed_load {
Expand All @@ -145,8 +145,8 @@ struct indexed_load {
U* address = compute_address(src, div, mod);
result = *address;
update_indices<T>(div, mod);


return array<U, s>(
result,
indexed_load<s-1, T>::impl(src, div, mod));
Expand Down Expand Up @@ -207,7 +207,7 @@ bool is_contiguous(int warp_id, const T* ptr) {
template<typename T>
__device__ typename enable_if<use_shfl<T>::value, T>::type
load_dispatch(const T* src) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
// if (detail::is_contiguous(warp_id, src)) {
// return detail::load_warp_contiguous(src);
// } else {
Expand All @@ -220,7 +220,7 @@ load_dispatch(const T* src) {
warp_id % address_constants<T>::m);
r2c_warp_transpose(loaded);
return detail::fuse<T>(loaded);
// }
// }
}


Expand All @@ -235,7 +235,7 @@ load_dispatch(const T* src) {
template<typename T>
__device__ typename enable_if<use_shfl<T>::value>::type
store_dispatch(const T& data, T* dest) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
// if (detail::is_contiguous(warp_id, dest)) {
// detail::store_warp_contiguous(data, dest);
// } else {
Expand All @@ -256,7 +256,7 @@ store_dispatch(const T& data, T* dest) {
detail::divergent_store(data, dest);
}


}

template<typename T>
Expand Down
58 changes: 30 additions & 28 deletions trove/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ struct c2r_offset_constants<m, odd> {

template<int m>
struct c2r_offset_constants<m, power_of_two> {
static const int offset = WARP_SIZE - WARP_SIZE/m;
static const int offset = WARP_SIZE - WARP_SIZE/m;
static const int permute = m - 1;
};

Expand Down Expand Up @@ -219,7 +219,8 @@ template<int m>
struct c2r_compute_initial_offset<m, odd> {
typedef c2r_offset_constants<m> constants;
__device__ static int impl() {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;

int initial_offset = ((WARP_SIZE - warp_id) * constants::offset)
& WARP_MASK;
return initial_offset;
Expand All @@ -229,7 +230,7 @@ struct c2r_compute_initial_offset<m, odd> {
template<int m>
struct c2r_compute_initial_offset<m, power_of_two> {
__device__ static int impl() {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int initial_offset = ((warp_id * (WARP_SIZE + 1)) >>
static_log<m>::value)
& WARP_MASK;
Expand All @@ -243,7 +244,8 @@ struct r2c_compute_initial_offset {};
template<int m>
struct r2c_compute_initial_offset<m, odd> {
__device__ static int impl() {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;

int initial_offset = (warp_id * m) & WARP_MASK;
return initial_offset;
}
Expand All @@ -262,7 +264,7 @@ array<int, m> c2r_compute_offsets() {

template<typename T, int m, int p = 0>
struct c2r_compute_composite_offsets{};

template<int s, int m, int p>
struct c2r_compute_composite_offsets<array<int, s>, m, p> {
static const int n = WARP_SIZE;
Expand All @@ -284,7 +286,7 @@ struct c2r_compute_composite_offsets<array<int, s>, m, p> {
result_type(offset,
c2r_compute_composite_offsets<array<int, s-1>, m, p+1>
::impl(new_idx, col));

}
};

Expand All @@ -304,11 +306,11 @@ struct c2r_compute_composite_offsets<array<int, 1>, m, p> {
int offset = ((((idx >> log_c) * k) & mod_n_div_c) +
((idx & mod_c) << log_n_div_c)) & mod_n;
return result_type(offset);

}
};


template<int index, int offset, int bound>
struct r2c_offsets {
static const int value = (offset * index) % bound;
Expand Down Expand Up @@ -365,7 +367,7 @@ struct r2c_compute_offsets_impl<array<int, 1>, index, m, power_of_two> {

template<typename T, int m>
struct r2c_compute_composite_offsets{};

template<int s, int m>
struct r2c_compute_composite_offsets<array<int, s>, m> {
static const int n = WARP_SIZE;
Expand All @@ -381,7 +383,7 @@ struct r2c_compute_composite_offsets<array<int, s>, m> {
result_type(offset & mod_n,
r2c_compute_composite_offsets<array<int, s-1>, m>
::impl(col, new_offset, lb, ub));

}
};

Expand All @@ -395,7 +397,7 @@ struct r2c_compute_composite_offsets<array<int, 1>, m> {
typedef array<int, 1> result_type;
__host__ __device__ static result_type impl(int col, int offset, int lb, int ub) {
return result_type(offset & mod_n);

}
};

Expand All @@ -408,8 +410,8 @@ array<int, m> r2c_compute_offsets() {
return r2c_compute_offsets_impl<result_type,
0, m, Schema>::impl(initial_offset);
}


template<typename Data, typename Indices>
struct warp_shuffle {};

Expand Down Expand Up @@ -447,7 +449,7 @@ template<typename Array>
struct c2r_compute_indices_impl<Array, odd> {
__device__ static void impl(Array& indices, int& rotation) {
indices = detail::c2r_compute_offsets<Array::size, odd>();
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int size = Array::size;
int r = detail::c2r_offset_constants<Array::size>::rotate;
rotation = (warp_id * r) % size;
Expand All @@ -458,7 +460,7 @@ template<typename Array>
struct c2r_compute_indices_impl<Array, power_of_two> {
__device__ static void impl(Array& indices, int& rotation) {
indices = detail::c2r_compute_offsets<Array::size, power_of_two>();
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int size = Array::size;
rotation = (size - warp_id) & (size - 1);
}
Expand All @@ -467,8 +469,8 @@ struct c2r_compute_indices_impl<Array, power_of_two> {
template<typename Array>
struct c2r_compute_indices_impl<Array, composite> {
__device__ static void impl(Array& indices, int& rotation) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;

indices = detail::c2r_compute_composite_offsets<Array, Array::size>::
impl(warp_id, warp_id);
rotation = warp_id % Array::size;
Expand All @@ -493,11 +495,11 @@ struct c2r_warp_transpose_impl<Array, Indices, power_of_two> {
__device__ static void impl(Array& src,
const Indices& indices,
const int& rotation) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int pre_rotation = warp_id >>
(LOG_WARP_SIZE -
static_log<Array::size>::value);
src = rotate(src, pre_rotation);
src = rotate(src, pre_rotation);
c2r_warp_transpose_impl<Array, Indices, odd>::impl
(src, indices, rotation);
}
Expand All @@ -508,7 +510,7 @@ struct c2r_warp_transpose_impl<Array, Indices, composite> {
__device__ static void impl(Array& src,
const Indices& indices,
const int& rotation) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int pre_rotation = warp_id >> static_log<WARP_SIZE/static_gcd<Array::size, WARP_SIZE>::value>::value;
src = rotate(src, pre_rotation);
detail::warp_shuffle<Array, Indices>::impl(src, indices);
Expand All @@ -525,7 +527,7 @@ struct r2c_compute_indices_impl<Array, odd> {
__device__ static void impl(Array& indices, int& rotation) {
indices =
detail::r2c_compute_offsets<Array::size, odd>();
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int size = Array::size;
int r =
size - detail::r2c_offset_constants<Array::size>::permute;
Expand All @@ -544,7 +546,7 @@ struct r2c_compute_indices_impl<Array, power_of_two> {
static const int n_div_m = WARP_SIZE / m;
static const int log_n_div_m = static_log<n_div_m>::value;
__device__ static void impl(Array& indices, int& rotation) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
int size = Array::size;
rotation = warp_id % size;
int initial_offset = ((warp_id << log_m) + (warp_id >> log_n_div_m)) & mod_n;
Expand All @@ -559,13 +561,13 @@ struct r2c_compute_indices_impl<Array, composite> {
static const int size = Array::size;
static const int c = static_gcd<size, WARP_SIZE>::value;
__device__ static void impl(Array& indices, int& rotation) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
rotation = size - (warp_id % size);
int lb = (size * warp_id) & WARP_MASK;
int ub = lb + size;
int offset = lb + warp_id / (WARP_SIZE/c);
indices = detail::r2c_compute_composite_offsets<Array, Array::size>::
impl(warp_id, offset, lb, ub);
impl(warp_id, offset, lb, ub);
}
};

Expand All @@ -591,7 +593,7 @@ struct r2c_warp_transpose_impl<Array, Indices, power_of_two> {
Array rotated = rotate(src, rotation);
detail::warp_shuffle<Array, Indices>::impl(rotated, indices);
const int size = Array::size;
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
src = rotate(detail::r2c_tx_permute(rotated),
(size-warp_id/(WARP_SIZE/size))%size);
}
Expand All @@ -604,7 +606,7 @@ struct r2c_warp_transpose_impl<Array, Indices, composite> {
__device__ static void impl(Array& src,
const Indices& indices,
const int& rotation) {
int warp_id = threadIdx.x & WARP_MASK;
int warp_id = thread_id() & WARP_MASK;
src = composite_r2c_tx_permute(src);
src = rotate(src, rotation);
detail::warp_shuffle<Array, Indices>::impl(src, indices);
Expand All @@ -621,7 +623,7 @@ __device__ void c2r_compute_indices(array<int, i>& indices, int& rotation) {
Array,
typename detail::tx_algorithm<i>::type>
::impl(indices, rotation);

}

template<typename T, int i>
Expand Down Expand Up @@ -676,7 +678,7 @@ __device__ void r2c_warp_transpose(array<T, i>& src) {
indices_array indices;
int rotation;
r2c_compute_indices(indices, rotation);

detail::r2c_warp_transpose_impl<
Array, array<int, i>,
typename detail::tx_algorithm<i>::type>
Expand Down
4 changes: 4 additions & 0 deletions trove/warp.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ enum {
LOG_WARP_SIZE = 5
};

inline __device__ int thread_id() {
return (threadIdx.z*blockDim.y + threadIdx.y)*blockDim.x + threadIdx.x;
}

__device__
inline bool warp_converged() {
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
Expand Down