MRA: Add print task and have it compile for the host
Execution on the host currently won't work because there is no
host-side device support. We need to add the ExecutionSpace to
ttg::device::Task and allow the Host space to be selected there.

Signed-off-by: Joseph Schuchart <[email protected]>
devreal committed Sep 3, 2024
1 parent 0da214a commit 852e93b
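
To illustrate the change the commit message calls for, a device-task type parameterized on the execution space could look roughly as follows. This is a hypothetical sketch, not TTG's current API; the Space parameter and its Host value are precisely the additions described above.

// Hypothetical sketch only; names and the default are assumptions.
namespace ttg {
  enum class ExecutionSpace { Host, CUDA, HIP };
  namespace device {
    template <ExecutionSpace Space = ExecutionSpace::CUDA>
    struct Task {
      static constexpr ExecutionSpace space = Space;
      /* coroutine promise machinery elided */
    };
  } // namespace device
} // namespace ttg

// A host-compiled task could then select the Host space explicitly:
using HostTask = ttg::device::Task<ttg::ExecutionSpace::Host>;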
Showing 11 changed files with 329 additions and 110 deletions.
9 changes: 9 additions & 0 deletions examples/CMakeLists.txt
@@ -59,6 +59,15 @@ if (TARGET tiledarray)
endif()

add_ttg_executable(tensortest madness/mra-device/tests/tensor_test.cc)
add_ttg_executable(mradevice-host
                   madness/mra-device/mrattg-device.cc
                   madness/mra-device/gl.cc
                   madness/mra-device/kernels.cc
                   mratwoscale.cc
                   COMPILE_DEFINITIONS TTG_ENABLE_HOST=1
                   LINK_LIBRARIES MADworld
                   RUNTIMES "parsec")
if (TTG_HAVE_CUDA)
add_ttg_executable(mradevice-cuda
madness/mra-device/mrattg-device.cc
13 changes: 13 additions & 0 deletions examples/madness/mra-device/functionnode.h
@@ -3,6 +3,7 @@

#include "key.h"
#include "tensor.h"
#include "functions.h"

namespace mra {
template <typename T, Dimension NDIM>
@@ -77,6 +78,18 @@ namespace mra {
}
};

template <typename T, Dimension NDIM, typename ostream>
ostream& operator<<(ostream& s, const FunctionReconstructedNode<T,NDIM>& node) {
s << "FunctionReconstructedNode(" << node.key << "," << node.is_leaf << "," << mra::normf(node.coeffs.current_view()) << ")";
return s;
}

template <typename T, Dimension NDIM, typename ostream>
ostream& operator<<(ostream& s, const FunctionCompressedNode<T,NDIM>& node) {
s << "FunctionCompressedNode(" << node.key << "," << mra::normf(node.coeffs.current_view()) << ")";
return s;
}

} // namespace mra

#endif // HAVE_MRA_FUNCTIONNODE_H
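
The two overloads above are what a host-side print task can stream nodes through. A minimal usage sketch, assuming functionnode.h is included and a node is available from some task; nothing here is part of the diff:

#include <iostream>

template <typename T, mra::Dimension NDIM>
void print_node(const mra::FunctionReconstructedNode<T, NDIM>& node) {
  // prints: FunctionReconstructedNode(<key>,<is_leaf>,<coefficient norm>)
  std::cout << node << std::endl;
}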
47 changes: 47 additions & 0 deletions examples/madness/mra-device/functions.h
@@ -94,6 +94,53 @@ namespace mra {
#endif // __CUDA_ARCH__
}

template <typename T, Dimension NDIM, typename accumulatorT>
SCOPE void sumabssq(const TensorView<T, NDIM>& a, accumulatorT* sum) {
#ifdef __CUDA_ARCH__
int tid = threadIdx.x + threadIdx.y + threadIdx.z; /* zero only for thread (0,0,0) of the block */
accumulatorT s = 0.0;
/* play it safe: set sum to zero before the atomic increments */
if (tid == 0) { *sum = 0.0; }
/* wait for thread 0 */
SYNCTHREADS();
/* every thread computes a partial sum (likely 1 element only) */
foreach_idx(a, [&](auto... idx) mutable {
accumulatorT x = a(idx...);
s += x*x;
});
/* accumulate thread-partial results into sum
* if we had shared memory we could use that here but for now run with atomics
* NOTE: needs CUDA architecture 6.0 or higher */
atomicAdd_block(sum, s);
SYNCTHREADS();
#else // __CUDA_ARCH__
accumulatorT s = 0.0;
foreach_idx(a, [&](auto... idx) mutable {
accumulatorT x = a(idx...);
s += x*x;
});
*sum = s;
#endif // __CUDA_ARCH__
}


/// Compute Frobenius norm ... still needs specializing for complex
template <typename T, Dimension NDIM, typename accumulatorT = T>
SCOPE accumulatorT normf(const TensorView<T, NDIM>& a) {
#ifdef __CUDA_ARCH__
__shared__ accumulatorT sum;
#else // __CUDA_ARCH__
accumulatorT sum;
#endif // __CUDA_ARCH__
sumabssq(a, &sum); // accumulatorT is deduced from &sum
#ifdef __CUDA_ARCH__
/* wait for all threads to contribute */
SYNCTHREADS();
#endif // __CUDA_ARCH__
return std::sqrt(sum);
}


} // namespace mra


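On the host, the #else branch of sumabssq reduces to a sequential loop, so normf is simply the square root of the sum of squares. A self-contained sketch of that path, with a raw array standing in for TensorView (an assumption made for the example):

#include <cmath>
#include <cstddef>
#include <iostream>

// Host-path equivalent of sumabssq + normf: a single "thread" visits
// every element, accumulates x*x, and takes the square root at the end.
template <typename T>
T normf_host(const T* a, std::size_t n) {
  T sum = T(0);
  for (std::size_t i = 0; i < n; ++i) sum += a[i] * a[i];
  return std::sqrt(sum);
}

int main() {
  const double v[] = {1.0, 2.0, 2.0, 4.0};
  std::cout << normf_host(v, 4) << '\n'; // 1+4+4+16 = 25, prints 5
}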
18 changes: 18 additions & 0 deletions examples/madness/mra-device/kernels.cc
@@ -0,0 +1,18 @@
#include "ttg.h"
#include "util.h"

/* reuse dim3 from CUDA/HIP if available */
#if !defined(TTG_HAVE_CUDA) && !defined(TTG_HAVE_HIP)
struct dim3 {
  int x, y, z;
  /* mirror CUDA's dim3: unspecified components default to 1,
   * so dim3(1) and dim3(K, K) behave as they do on the device */
  constexpr dim3(int x = 1, int y = 1, int z = 1) : x(x), y(y), z(z) {}
};
#endif

/* define our own thread layout (single thread) */
static constexpr const dim3 threadIdx = {0, 0, 0};
static constexpr const dim3 blockIdx = {0, 0, 0};
static constexpr const dim3 blockDim = {1, 1, 1};
static constexpr const dim3 gridDim = {1, 1, 1};

/* include the CUDA code and hope that all is well */
#include "kernels.cu"
103 changes: 30 additions & 73 deletions examples/madness/mra-device/kernels.cu
@@ -7,8 +7,7 @@
#include "gl.h"
#include "kernels.h"
#include "functions.h"

//#include "gl.cu"
#include "util.h"

template<typename T>
struct type_printer;
@@ -17,7 +16,7 @@ using namespace mra;

/// Set X(d,mu) to be the mu'th quadrature point in dimension d for the box described by key
template<typename T, Dimension NDIM>
__device__ void make_quadrature_pts(
SCOPE void make_quadrature_pts(
const Domain<NDIM>& D,
const T* gldata,
const Key<NDIM>& key,
@@ -31,7 +30,6 @@ __device__ void make_quadrature_pts(
/* retrieve x[] from constant memory, use float */
const T *x, *w;
detail::GLget(gldata, &x, &w, K);
#ifdef __CUDA_ARCH__
if (threadIdx.z == 0) {
for (int d = threadIdx.y; d < X.dim(0); d += blockDim.y) {
T lo, hi; std::tie(lo,hi) = D.get(d);
@@ -42,16 +40,7 @@ __device__ void make_quadrature_pts(
}
}
/* wait for all to complete */
__syncthreads();
#else // __CUDA_ARCH__
for (Dimension d : range(NDIM)) {
T lo, hi; std::tie(lo,hi) = D.get(d);
T width = h*D.get_width(d);
for (size_t i : range(X.dim(1))) {
X(d,i) = lo + width*(l[d] + x[i]);
}
}
#endif // __CUDA_ARCH__
SYNCTHREADS();
}


@@ -69,7 +58,7 @@ __device__ void make_quadrature_pts(


template <typename functionT, typename T, Dimension NDIM>
__device__ bool is_negligible(
SCOPE bool is_negligible(
const functionT& f,
const std::pair<Coordinate<T,NDIM>, Coordinate<T,NDIM>>& box,
T thresh)
@@ -79,39 +68,9 @@ __device__ bool is_negligible(
else return false;
}


template <typename T, Dimension NDIM, typename accumulatorT>
__device__ void sumabssq(const TensorView<T, NDIM>& a, accumulatorT* sum) {
int tid = threadIdx.x + blockIdx.y + blockIdx.z;
accumulatorT s = 0.0;
/* play it safe: set sum to zero before the atomic increments */
if (tid == 0) { *sum = 0.0; }
__syncthreads();
/* every thread computes a partial sum (likely 1 element only) */
foreach_idx(a, [&](auto... idx) mutable {
accumulatorT x = a(idx...);
s += x*x;
});
/* accumulate thread-partial results into sum
* if we had shared memory we could use that here but for now run with atomics
* NOTE: needs CUDA architecture 6.0 or higher */
atomicAdd_block(sum, s);
__syncthreads();
}

/// Compute Frobenius norm ... still needs specializing for complex
template <typename T, Dimension NDIM, typename accumulatorT = T>
__device__ accumulatorT tensor_normf(TensorView<T, NDIM>& a) {
__shared__ accumulatorT sum; // TODO: is that safe?
sumabssq<accumulatorT>(a, &sum);
__syncthreads();
return std::sqrt(sum);
}


/// Make outer product of quadrature points for vectorized algorithms
template<typename T>
__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
SCOPE void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
std::integral_constant<Dimension, 1>) {
/* uses threads in 3 dimensions */
xvec = x;
@@ -120,7 +79,7 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,

/// Make outer product of quadrature points for vectorized algorithms
template<typename T>
__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
SCOPE void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
std::integral_constant<Dimension, 2>) {
const std::size_t K = x.dim(1);
if (threadIdx.z == 0) {
@@ -132,12 +91,12 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
}
}
}
__syncthreads();
SYNCTHREADS();
}

/// Make outer product of quadrature points for vectorized algorithms
template<typename T>
__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
SCOPE void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
std::integral_constant<Dimension, 3>) {
const std::size_t K = x.dim(1);
for (size_t i=threadIdx.z; i<K; i += blockDim.z) {
@@ -150,12 +109,12 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec,
}
}
}
__syncthreads();
SYNCTHREADS();
}


template <typename functorT, typename T, Dimension NDIM>
__device__
SCOPE
void fcube(const Domain<NDIM>& D,
const T* gldata,
const functorT& f,
@@ -205,13 +164,13 @@ void fcube(const Domain<NDIM>& D,
//throw "how did we get here?";
// TODO: how to handle this?
}
__syncthreads();
SYNCTHREADS();
}
}

/* reference implementation, adapted from madness */
template <typename aT, typename bT, typename cT>
__device__
SCOPE
void mTxmq(std::size_t dimi, std::size_t dimj, std::size_t dimk,
cT* __restrict__ c, const aT* a, const bT* b, std::ptrdiff_t ldb=-1) {
if (ldb == -1) ldb=dimj;
@@ -230,10 +189,10 @@ void mTxmq(std::size_t dimi, std::size_t dimj, std::size_t dimk,
}
}
}
__syncthreads();
SYNCTHREADS();
}
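
The contraction mTxmq implements is c(i,j) += sum_k a(k,i)*b(k,j), i.e. C += A^T * B with A stored k-major. A plain scalar restatement of the same loop nest (a standalone sketch, not the kernel code):

#include <cstddef>

// Scalar reference for mTxmq: c(i,j) += sum_k a(k,i) * b(k,j).
// All matrices row-major; ldb is the row stride of b, as in the kernel.
void mTxm_ref(std::size_t dimi, std::size_t dimj, std::size_t dimk,
              double* c, const double* a, const double* b,
              std::ptrdiff_t ldb) {
  for (std::size_t i = 0; i < dimi; ++i)
    for (std::size_t k = 0; k < dimk; ++k)
      for (std::size_t j = 0; j < dimj; ++j)
        c[i*dimj + j] += a[k*dimi + i] * b[k*ldb + j];
}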
template <Dimension NDIM, typename T>
__device__
SCOPE
void transform(const TensorView<T,NDIM>& t,
const TensorView<T,2>& c,
TensorView<T,NDIM>& result,
@@ -253,7 +212,7 @@ void transform(const TensorView<T,NDIM>& t,
}

template<Dimension NDIM>
__device__
SCOPE
std::array<Slice, NDIM> get_child_slice(Key<NDIM> key, std::size_t K, int child) {
std::array<Slice,NDIM> slices;
for (size_t d = 0; d < NDIM; ++d) {
@@ -287,11 +246,11 @@ __global__ void fcoeffs_kernel1(
auto x = TensorView<T, 2 >(&tmp[TWOK2NDIM+4*K2NDIM + (NDIM*K2NDIM)], NDIM, K);
auto phibar = TensorView<T, 2 >(phibar_ptr, K, K);
/* compute one child per block */
if (blockid < key.num_children) {
Key<NDIM> child = key.child_at(blockid);
for (int bid = blockid; bid < key.num_children; bid += gridDim.x) {
Key<NDIM> child = key.child_at(bid);
fcube(D, gldata, f, child, thresh, child_values, K, x, x_vec);
transform(child_values, phibar, r, workspace);
auto child_slice = get_child_slice<NDIM>(key, K, blockid);
auto child_slice = get_child_slice<NDIM>(key, K, bid);
TensorSlice<TensorView<T, NDIM>> s = values(child_slice);
s = r;
}
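
The guard this loop replaces assumed exactly one block per child; the block-stride form also works when the grid has fewer blocks than children, in particular in the host build, where gridDim.x is 1 and the single "block" walks all children in order. The pattern in isolation (illustrative, not taken from this diff):

// Block-stride iteration: correct for any gridDim.x >= 1.
// With gridDim.x == 1 (the host setup) one call visits every child.
__global__ void visit_children(int num_children) {
  for (int bid = blockIdx.x; bid < num_children; bid += gridDim.x) {
    /* ... process child bid ... */
  }
}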
@@ -331,7 +290,7 @@ __global__ void fcoeffs_kernel2(
/* TensorView assignment synchronizes */
if (tid == 0) {
/* TODO: compute the norm across threads */
*is_leaf = (tensor_normf(r) < truncate_tol(key,thresh)); // test norm of difference coeffs
*is_leaf = (mra::normf(r) < truncate_tol(key,thresh)); // test norm of difference coeffs
}
}

@@ -368,10 +327,10 @@
}

/* launch one block per child */
fcoeffs_kernel1<<<key.num_children, thread_dims, 0, stream>>>(
CALL_KERNEL(fcoeffs_kernel1, key.num_children, thread_dims, 0, stream)(
D, gldata, fn, key, tmp, phibar_view.data(), K, thresh);
/* launch one block only */
fcoeffs_kernel2<<<1, thread_dims, 0, stream>>>(
CALL_KERNEL(fcoeffs_kernel2, 1, thread_dims, 0, stream)(
D, key, coeffs_view.data(), hgT_view.data(),
tmp, is_leaf_scratch, K, thresh);
}
@@ -423,19 +382,20 @@ __global__ void compress_kernel(
auto p = TensorView<T,NDIM>(p_ptr, K);
auto hgT = TensorView<T,2>(hgT_ptr, K);
auto workspace = TensorView<T, NDIM>(&tmp[TWOK2NDIM], K);
auto child_slice = get_child_slice<NDIM>(key, K, blockid);
if (is_t0) {
sumsqs[0] = 0.0;
}

if (blockid < key.num_children) {
const TensorView<T, NDIM> in(in_ptrs[blockid], K);
sumabssq(in, &sumsqs[blockid]);
for (int bid = blockid; bid < key.num_children; bid += gridDim.x) {
auto child_slice = get_child_slice<NDIM>(key, K, bid);
const TensorView<T, NDIM> in(in_ptrs[bid], K);
sumabssq(in, &sumsqs[bid]);
s(child_slice) = in;
//filter<T,K,NDIM>(s,d); // Apply twoscale transformation
transform<NDIM>(s, hgT, r, workspace);
}
if (key.level() > 0 && blockid == 0) {
auto child_slice = get_child_slice<NDIM>(key, K, 0);
p = r(child_slice);
r(child_slice) = 0.0;
sumabssq(r, &sumsqs[key.num_children]); // put result sumsq at the end
@@ -464,7 +424,7 @@
thread_dims = dim3(K, 1, 1);
}

compress_kernel<<<key.num_children, thread_dims, 0, stream>>>(
CALL_KERNEL(compress_kernel, key.num_children, thread_dims, 0, stream)(
key, p_view.data(), result_view.data(), hgT_view.data(), tmp, sumsqs, in_ptrs, K);
}

@@ -496,8 +456,6 @@ __global__ void reconstruct_kernel(
std::array<T*, (1<<NDIM) > r_arr,
std::size_t K)
{
int blockid = blockIdx.x;

const size_t K2NDIM = std::pow( K,NDIM);
const size_t TWOK2NDIM = std::pow(2*K,NDIM);
auto node = TensorView<T, NDIM>(node_ptr, 2*K);
Expand All @@ -506,7 +464,7 @@ __global__ void reconstruct_kernel(
auto hg = TensorView<T, 2>(hg_ptr, K);
auto from_parent = TensorView<T, NDIM>(from_parent_ptr, K);

auto child_slice = get_child_slice<NDIM>(key, K, blockid);
auto child_slice = get_child_slice<NDIM>(key, K, 0);
if (key.level() != 0) node(child_slice) = from_parent;

//unfilter<T,K,NDIM>(node.get().coeffs, s);
Expand All @@ -533,6 +491,7 @@ void submit_reconstruct_kernel(
cudaStream_t stream)
{
const std::size_t K = node.dim(0);
/* runs on a single block */
dim3 thread_dims = dim3(1);
if constexpr (NDIM >= 3) {
thread_dims = dim3(K, K, K);
Expand All @@ -541,9 +500,7 @@ void submit_reconstruct_kernel(
} else if constexpr (NDIM == 1) {
thread_dims = dim3(K, 1, 1);
}

/* runs on a single block */
reconstruct_kernel<<<1, thread_dims, 0, stream>>>(
CALL_KERNEL(reconstruct_kernel, 1, thread_dims, 0, stream)(
key, node.data(), tmp, hg.data(), from_parent.data(), r_arr, K);
}
