Skip to content

Commit

Permalink
Inline fcoeffs into make_project and fix compile errors
Browse files Browse the repository at this point in the history
Signed-off-by: Joseph Schuchart <[email protected]>
  • Loading branch information
devreal committed Jul 26, 2024
1 parent 0f6df6f commit 9f2c8ed
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 31 deletions.
16 changes: 8 additions & 8 deletions examples/madness/mra-device/functionnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ namespace mra {
}

template <typename T>
void distancesq(const Coordinate<T,3>& p, const TensorView<T,2>& q, T* rsq, std::size_t N) {
void distancesq(const Coordinate<T,3>& p, const TensorView<T,1>& q, T* rsq, std::size_t N) {
const T x = p(0);
#ifdef __CUDA_ARCH__
int tid = threadIdx.x*blockDim.x + threadIdx.y;
for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) {
int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x;
for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) {
T xx = q(0,i) - x;
rsq[i] = xx*xx;
}
Expand All @@ -73,8 +73,8 @@ namespace mra {
const T x = p(0);
const T y = p(1);
#ifdef __CUDA_ARCH__
int tid = threadIdx.x*blockDim.x + threadIdx.y;
for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) {
int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x;
for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) {
T xx = q(0,i) - x;
T yy = q(1,i) - y;
rsq[i] = xx*xx + yy*yy;
Expand All @@ -89,13 +89,13 @@ namespace mra {
}

template <typename T>
void distancesq(const Coordinate<T,3>& p, const TensorView<T,2>& q, T* rsq, std::size_t N) {
void distancesq(const Coordinate<T,3>& p, const TensorView<T,3>& q, T* rsq, std::size_t N) {
const T x = p(0);
const T y = p(1);
const T z = p(2);
#ifdef __CUDA_ARCH__
int tid = threadIdx.x*blockDim.x + threadIdx.y;
for (size_t i = tid; i < N; i += blockDim.x*blockDim.y) {
int tid = threadDim.x * ((threadDim.y*threadIdx.z) + threadIdx.y) + threadIdx.x;
for (size_t i = tid; i < N; i += blockDim.x*blockDim.y*blockDim.z) {
T xx = q(0,i) - x;
T yy = q(1,i) - y;
T zz = q(2,i) - z;
Expand Down
22 changes: 11 additions & 11 deletions examples/madness/mra-device/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

/// Make outer product of quadrature points for vectorized algorithms
template<typename T>
__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::integral_constant<1>) {
__device__ void make_xvec(const mra::TensorView<T,2>& x, mra::TensorView<T,2>& xvec, std::integral_constant<1>) {
/* uses threads in 3 dimensions */
xvec = x;
/* TensorView assignment synchronizes */
}

/// Make outer product of quadrature points for vectorized algorithms
template<typename T>
__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::integral_constant<2>) {
__device__ void make_xvec(const mra::TensorView<T,2>& x, mra::TensorView<T,2>& xvec, std::integral_constant<2>) {
const std::size_t K = x.dim(1);
if (threadId.z == 0) {
for (size_t i=blockIdx.y; i<K; i += blockDim.y) {
Expand All @@ -32,7 +32,7 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::

/// Make outer product of quadrature points for vectorized algorithms
template<typename T>
__device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::integral_constant<3>) {
__device__ void make_xvec(const mra::TensorView<T,2>& x, mra::TensorView<T,2>& xvec, std::integral_constant<3>) {
const std::size_t K = x.dim(1);
for (size_t i=threadIdx.z; i<K; i += blockDim.z) {
for (size_t j=blockIdx.y; j<K; j += blockDim.y) {
Expand All @@ -48,17 +48,17 @@ __device__ void make_xvec(const TensorView<T,2>& x, TensorView<T,2>& xvec, std::
}


template <typename functorT, typename T, Dimension NDIM>
template <typename functorT, typename T, mra::Dimension NDIM>
__device__
void fcube(const functorT& f,
const Key<NDIM>& key,
const mra::Key<NDIM>& key,
const T thresh,
// output
TensorView<T,2>& values,
mra::TensorView<T,2>& values,
std::size_t K,
// temporaries
TensorView<T, NDIM> x,
TensorView<T, 2> xvec) {
mra::TensorView<T, NDIM> x,
mra::TensorView<T, 2> xvec) {
if (is_negligible(f,Domain<NDIM>:: template bounding_box<T>(key),truncate_tol(key,thresh))) {
values = 0.0;
/* TensorView assigment synchronizes */
Expand Down Expand Up @@ -149,7 +149,7 @@ std::array<mra::Slice, NDIM> get_child_slice(mra::Key<NDIM> key, std::size_t K,
return slices;
}

template<typename Fn, typename T, std::size_t NDIM>
template<typename Fn, typename T, mra::Dimension NDIM>
__global__ fcoeffs_kernel1(
const Fn f,
mra::Key<NDIM> key,
Expand Down Expand Up @@ -180,7 +180,7 @@ __global__ fcoeffs_kernel1(
}
}

template<typename T, std::size_t NDIM>
template<typename T, mra::Dimension NDIM>
__global__ fcoeffs_kernel2(
mra::Key<NDIM> key,
T* coeffs_ptr,
Expand Down Expand Up @@ -217,7 +217,7 @@ __global__ fcoeffs_kernel2(
}
}

template<typename Fn, typename T, std::size_t NDIM>
template<typename Fn, typename T, mra::Dimension NDIM>
void submit_fcoeffs_kernel(
const Fn* fn,
const mra::Key<NDIM>& key,
Expand Down
4 changes: 2 additions & 2 deletions examples/madness/mra-device/kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ typedef int cudaStream;

/* Returns the total size of temporary memory needed for
* the project() kernel. */
template<std::size_t NDIM>
template<mra::Dimension NDIM>
std::size_t project_tmp_size(std::size_t K) {
const size_t K2NDIM = std::pow(K,NDIM);
return (NDIM*K2NDIM) // xvec in fcube
+ (5*K2NDIM); // x in fcube, workspace in transform, values, child_values, r
}

/* Explicitly instantiated for 1, 2, 3 dimensional Gaussians */
template<typename Fn, typename T, std::size_t NDIM>
template<typename Fn, typename T, mra::Dimension NDIM>
void submit_fcoeffs_kernel(
const Fn* fn,
const mra::Key<NDIM>& key,
Expand Down
46 changes: 45 additions & 1 deletion examples/madness/mra-device/mrattg-device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "functionfunctor.h"
#include "../../mrakey.h"

#if 0
/// Project the scaling coefficients using screening and test norm of difference coeffs. Return true if difference coeffs negligible.
template <typename FnT, typename T, mra::Dimension NDIM>
static bool fcoeffs(
Expand Down Expand Up @@ -60,6 +61,7 @@ static bool fcoeffs(
}
co_return status;
}
#endif // 0

template<typename FnT, typename T, mra::Dimension NDIM>
auto make_project(
Expand Down Expand Up @@ -93,7 +95,49 @@ auto make_project(
}
else {
/* here we actually compute: first select a device */
result.is_leaf = fcoeffs(f, functiondata, key, thresh, coeffs);
//result.is_leaf = fcoeffs(f, functiondata, key, thresh, coeffs);
/**
* BEGIN FCOEFFS HERE
* TODO: figure out a way to outline this into a function or coroutine
*/

/* global function data */
// TODO: need to make our own FunctionData with dynamic K
const auto& phibar = functiondata.get_phibar();
const auto& hgT = functiondata.get_hgT();

const std::size_t K = coeffs.dim(0);

/* temporaries */
bool is_leaf;
auto is_leaf_scratch = ttg::make_scratch(&is_leaf, ttg::scope::Allocate);
const std::size_t tmp_size = project_tmp_size<NDIM>(K);
T* tmp = new T[tmp_size]; // TODO: move this into make_scratch()
auto tmp_scratch = ttg::make_scratch(tmp, ttg::scope::Allocate, tmp_size);

/* TODO: cannot do this from a function, need to move it into the main task */
co_await ttg::device::select(f, coeffs.buffer(), phibar.buffer(),
hgT.buffer(), tmp_scratch, is_leaf_scratch);
auto coeffs_view = coeffs.current_view();
auto phibar_view = phibar.current_view();
auto hgT_view = hgT.current_view();
T* tmp_device = tmp_scratch.device_ptr();
bool *is_leaf_device = is_leaf_scratch.device_ptr();
FnT* f_ptr = f.current_device_ptr();

/* submit the kernel */
submit_fcoeffs_kernel(f_ptr, key, coeffs_view, phibar_view, hgT_view, tmp_device,
is_leaf_device, ttg::device::current_stream());

/* wait and get is_leaf back */
co_await ttg::device::wait(is_leaf_scratch);
result.is_leaf = is_leaf;
/* todo: is this safe? */
delete[] tmp;
/**
* END FCOEFFS HERE
*/

if (!result.is_leaf) {
std::vector<mra::Key<NDIM>> bcast_keys;
for (auto child : children(key)) bcast_keys.push_back(child);
Expand Down
20 changes: 16 additions & 4 deletions examples/madness/mra-device/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ namespace mra {
using value_type = std::decay_t<T>;
using size_type = std::size_t;
using allocator_type = Allocator;
using view_type = TensorView<T, NDIM>;
using view_type = TensorView<value_type, NDIM>;
using const_view_type = TensorView<std::add_const_t<value_type>, NDIM>;
using buffer_type = ttg::Buffer<value_type, allocator_type>;

static constexpr Dimension ndim() { return NDIM; }
Expand Down Expand Up @@ -75,11 +76,15 @@ namespace mra {
return std::reduce(&m_dims[0], &m_dims[ndim()], 1, std::multiplies<size_type>{});
}

auto get_buffer() {
size_type dim(Dimension dim) const {
return m_dims[dim];
}

auto& buffer() {
return m_buffer;
}

const auto get_buffer() const {
const auto& buffer() const {
return m_buffer;
}

Expand All @@ -91,10 +96,17 @@ namespace mra {
return m_buffer.host_ptr();
}

/* returns a view for the current memory space */
/* returns a view for the current memory space
* TODO: handle const correctness (const Tensor should return a const TensorView)*/
view_type current_view() {
return view_type(m_buffer.current_device_ptr(), m_dims);
}

/* returns a view for the current memory space
* TODO: handle const correctness (const Tensor should return a const TensorView)*/
const_view_type current_view() const {
return const_view_type(m_buffer.current_device_ptr(), m_dims);
}
};

} // namespace mra
Expand Down
11 changes: 6 additions & 5 deletions examples/madness/mra-device/tensorview.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ namespace mra {
class TensorView {
public:
using value_type = std::decay_t<T>;
using const_value_type = std::add_const_t<value_type>;
using size_type = std::size_t;
static constexpr int ndim() { return NDIM; }
static constexpr Dimension ndim() { return NDIM; }
using dims_array_t = std::array<size_type, ndim()>;
static constexpr bool is_tensor() { return true; }

Expand Down Expand Up @@ -157,7 +158,7 @@ namespace mra {

public:
template<typename... Dims>
TensorView(value_type *ptr, Dims... dims)
TensorView(T *ptr, Dims... dims)
: m_dims({dims...})
, m_ptr(ptr)
{
Expand All @@ -169,7 +170,7 @@ namespace mra {
}
}

TensorView(value_type *ptr, const dims_array_t& dims)
TensorView(T *ptr, const dims_array_t& dims)
: m_dims(dims)
, m_ptr(ptr)
{ }
Expand Down Expand Up @@ -197,7 +198,7 @@ namespace mra {
return std::reduce(&m_dims[0], &m_dims[ndim()-1], 1, std::multiplies<size_type>{});
}

size_type dim(size_type d) const {
size_type dim(Dimension d) const {
return m_dims[d];
}

Expand Down Expand Up @@ -273,7 +274,7 @@ namespace mra {

private:
dims_array_t m_dims;
value_type *m_ptr;
T *m_ptr; // may be const or non-const
};


Expand Down

0 comments on commit 9f2c8ed

Please sign in to comment.