Skip to content

Commit

Permalink
Pass data as pointers to GPU instead of dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
nknk567 committed Feb 12, 2025
1 parent a5c7b0f commit 515f635
Show file tree
Hide file tree
Showing 11 changed files with 109 additions and 58 deletions.
5 changes: 4 additions & 1 deletion physics/Disk/include/accretion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "accretion_gpu.hpp"
#include "buffer_reduce.hpp"
#include "cstone/primitives/accel_switch.hpp"
#include "get_ptr.hpp"

namespace disk
{
Expand All @@ -21,7 +22,9 @@ void computeAccretionCondition(size_t first, size_t last, Dataset& d, StarData&
{
if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
{
computeAccretionConditionGPU(first, last, d, star);
computeAccretionConditionGPU(first, last, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"h">(d),
getPtr<"keys">(d), getPtr<"m">(d), getPtr<"vx">(d), getPtr<"vy">(d),
getPtr<"vz">(d), star);
}
else { computeAccretionConditionImpl(first, last, d, star); }
}
Expand Down
21 changes: 15 additions & 6 deletions physics/Disk/include/accretion_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,10 @@ __global__ void computeAccretionConditionKernel(size_t first, size_t last, const
if (threadIdx.x == 0) { atomicAddRS(device_removed, block_removed); }
}

template<typename Dataset>
void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star)
template<typename Treal, typename Thydro, typename Tkeys, typename Tmass>
void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
const Thydro* h, Tkeys* keys, const Tmass* m, const Thydro* vx, const Thydro* vy,
const Thydro* vz, StarData& star)
{
cstone::LocalIndex numParticles = last - first;
constexpr unsigned numThreads = 256;
Expand All @@ -96,9 +98,8 @@ void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarDat
checkGpuErrors(cudaMemcpy(removed_device, &star.removed_local, sizeof star.removed_local, cudaMemcpyHostToDevice));

computeAccretionConditionKernel<numThreads><<<numBlocks, numThreads>>>(
first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), rawPtr(d.devData.h),
rawPtr(d.devData.keys), rawPtr(d.devData.m), rawPtr(d.devData.vx), rawPtr(d.devData.vy), rawPtr(d.devData.vz),
star.position, star.inner_size * star.inner_size, star.removal_limit_h, accreted_device, removed_device);
first, last, x, y, z, h, keys, m, vx, vy, vz, star.position, star.inner_size * star.inner_size,
star.removal_limit_h, accreted_device, removed_device);

checkGpuErrors(cudaDeviceSynchronize());
checkGpuErrors(cudaGetLastError());
Expand All @@ -110,5 +111,13 @@ void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarDat
checkGpuErrors(cudaFree(removed_device));
}

template void computeAccretionConditionGPU(size_t, size_t, sphexa::ParticlesData<cstone::GpuTag>&, StarData&);
#define COMPUTE_ACCRETION_CONDITION_GPU(Treal, Thydro, Tkeys, Tmass) \
template void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, \
const Treal* z, const Thydro* h, Tkeys* keys, const Tmass* m, \
const Thydro* vx, const Thydro* vy, const Thydro* vz, StarData& star);

COMPUTE_ACCRETION_CONDITION_GPU(double, double, size_t, double);
COMPUTE_ACCRETION_CONDITION_GPU(double, float, size_t, double);
COMPUTE_ACCRETION_CONDITION_GPU(double, float, size_t, float);

} // namespace disk
6 changes: 4 additions & 2 deletions physics/Disk/include/accretion_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

namespace disk
{
template<typename Dataset>
void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star);
template<typename Treal, typename Thydro, typename Tkeys, typename Tmass>
void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
const Thydro* h, Tkeys* keys, const Tmass* m, const Thydro* vx, const Thydro* vy,
const Thydro* vz, StarData& star);
}
17 changes: 11 additions & 6 deletions physics/Disk/include/betaCooling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "betaCooling_gpu.hpp"
#include "cstone/primitives/accel_switch.hpp"
#include "get_ptr.hpp"

namespace disk
{
Expand All @@ -34,8 +35,8 @@ void betaCoolingImpl(size_t first, size_t last, Dataset& d, const StarData& star
}
}

template<typename Dataset, typename StarData>
auto duTimestepImpl(size_t first, size_t last, const Dataset& d, const StarData& star)
template<typename Dataset>
auto duTimestepImpl(size_t first, size_t last, const Dataset& d)
{

using Tu = std::decay_t<decltype(d.u[0])>;
Expand All @@ -46,12 +47,13 @@ auto duTimestepImpl(size_t first, size_t last, const Dataset& d, const StarData&
#pragma omp parallel for reduction(min : duTimestepMin)
for (size_t i = first; i < last; i++)
{
Tt duTimestep = star.K_u * std::abs(d.u[i] / d.du[i]);
Tt duTimestep = std::abs(d.u[i] / d.du[i]);
duTimestepMin = std::min(duTimestepMin, duTimestep);
}
return duTimestepMin;
}

//! @brief Cool the disk with a cooling time proportional to the Keplerian orbital time (around the star)
template<typename Dataset, typename StarData>
void betaCooling(size_t startIndex, size_t endIndex, Dataset& d, const StarData& star)
{
Expand All @@ -60,12 +62,15 @@ void betaCooling(size_t startIndex, size_t endIndex, Dataset& d, const StarData&
{
if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
{
betaCoolingGPU(startIndex, endIndex, d, star);
betaCoolingGPU(startIndex, endIndex, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"u">(d),
getPtr<"rho">(d), getPtr<"du">(d), d.g, star);
}
else { betaCoolingImpl(startIndex, endIndex, d, star); }
}
}

//! @brief Compute a maximal time step depending on how fast the internal energy changes
//! (e.g. due to cooling).
template<typename Dataset, typename StarData>
void duTimestep(size_t startIndex, size_t endIndex, Dataset& d, StarData& star)
{
Expand All @@ -77,9 +82,9 @@ void duTimestep(size_t startIndex, size_t endIndex, Dataset& d, StarData& star)
{
if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
{
star.t_du = duTimestepGPU(startIndex, endIndex, d, star);
star.t_du = star.K_u * duTimestepGPU(startIndex, endIndex, getPtr<"u">(d), getPtr<"du">(d));
}
else { star.t_du = duTimestepImpl(startIndex, endIndex, d, star); }
else { star.t_du = star.K_u * duTimestepImpl(startIndex, endIndex, d); }
}
}

Expand Down
41 changes: 22 additions & 19 deletions physics/Disk/include/betaCooling_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
namespace disk
{

template<typename Tpos, typename Tu, typename Ts, typename Tdu, typename Trho, typename Trho2>
__global__ void betaCoolingGPUKernel(size_t first, size_t last, const Tpos* x, const Tpos* y, const Tpos* z, Tdu* du,
const Tu* u, Ts star_mass, cstone::Vec3<Ts> star_position, Ts beta, Tpos g,
const Trho* rho, Ts u_floor, Trho2 cooling_rho_limit)
template<typename Treal, typename Thydro, typename Ts>
__global__ void betaCoolingGPUKernel(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
const Treal* u, const Thydro* rho, Treal* du, Treal g, Ts star_mass,
cstone::Vec3<Ts> star_position, Ts beta, Ts u_floor, Ts cooling_rho_limit)

{
cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -40,22 +40,26 @@ __global__ void betaCoolingGPUKernel(size_t first, size_t last, const Tpos* x, c
du[i] += -u[i] * omega / beta;
}

template<typename Dataset, typename StarData>
void betaCoolingGPU(size_t first, size_t last, Dataset& d, StarData& star)
template<typename Treal, typename Thydro>
void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, const Treal* u,
const Thydro* rho, Treal* du, const Treal g, const StarData& star)
{
cstone::LocalIndex numParticles = last - first;
unsigned numThreads = 256;
unsigned numBlocks = (numParticles + numThreads - 1) / numThreads;

betaCoolingGPUKernel<<<numBlocks, numThreads>>>(first, last, rawPtr(d.devData.x), rawPtr(d.devData.y),
rawPtr(d.devData.z), rawPtr(d.devData.du), rawPtr(d.devData.u),
star.m, star.position, star.beta, d.g, rawPtr(d.devData.rho),
star.u_floor, star.cooling_rho_limit);
betaCoolingGPUKernel<<<numBlocks, numThreads>>>(first, last, x, y, z, u, rho, du, g, star.m, star.position,
star.beta, star.u_floor, star.cooling_rho_limit);

checkGpuErrors(cudaDeviceSynchronize());
}

template void betaCoolingGPU(size_t, size_t, sphexa::ParticlesData<cstone::GpuTag>&, const StarData&);
#define BETA_COOLING_GPU(Treal, Thydro) \
template void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, \
const Treal* u, const Thydro* rho, Treal* du, const Treal g, const StarData& star);

BETA_COOLING_GPU(double, double);
BETA_COOLING_GPU(double, float);

template<typename Tu, typename Tdu>
struct AbsDivide
Expand All @@ -66,14 +70,11 @@ struct AbsDivide
}
};

template<typename Dataset, typename StarData>
double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star)
template<typename Treal>
double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du)
{
cstone::LocalIndex numParticles = last - first;

const auto* u = rawPtr(d.devData.u);
const auto* du = rawPtr(d.devData.du);

using Tu = std::decay_t<decltype(*u)>;
using Tdu = std::decay_t<decltype(*du)>;

Expand All @@ -82,10 +83,12 @@ double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData

double init = INFINITY;

return star.K_u *
thrust::transform_reduce(thrust::device, begin, end, AbsDivide<Tu, Tdu>{}, init, thrust::minimum<double>{});
return thrust::transform_reduce(thrust::device, begin, end, AbsDivide<Tu, Tdu>{}, init, thrust::minimum<double>{});
}

template double duTimestepGPU(size_t, size_t, const sphexa::ParticlesData<cstone::GpuTag>&, const StarData&);
#define DU_TIMESTEP_GPU(Treal) \
template double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du);

DU_TIMESTEP_GPU(double);

} // namespace disk
12 changes: 8 additions & 4 deletions physics/Disk/include/betaCooling_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@

#pragma once

#include "star_data.hpp"

namespace disk
{
template<typename Dataset, typename StarData>
void betaCoolingGPU(size_t first, size_t last, Dataset& d, StarData& star);

template<typename Dataset, typename StarData>
double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star);
template<typename Treal, typename Thydro>
void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, const Treal* u,
const Thydro* rho, Treal* du, const Treal g, const StarData& star);

template<typename Treal>
double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du);

} // namespace disk
6 changes: 4 additions & 2 deletions physics/Disk/include/computeCentralForce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "cstone/primitives/accel_switch.hpp"
#include "cstone/tree/definitions.h"
#include "computeCentralForce_gpu.hpp"
#include "get_ptr.hpp"

namespace disk
{
Expand All @@ -20,7 +21,7 @@ void computeCentralForceImpl(size_t first, size_t last, Dataset& d, StarData& st
cstone::Vec4<double> force_local{};
const double inner_size2 = star.inner_size * star.inner_size;

#pragma omp declare reduction(add_force : cstone::Vec4 <double> : omp_out = omp_out + omp_in) initializer(omp_priv = {})
#pragma omp declare reduction(add_force : cstone::Vec4<double> : omp_out = omp_out + omp_in) initializer(omp_priv = {})

#pragma omp parallel for reduction(add_force : force_local)
for (size_t i = first; i < last; i++)
Expand Down Expand Up @@ -54,7 +55,8 @@ void computeCentralForce(size_t startIndex, size_t endIndex, Dataset& d, StarDat
{
if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
{
computeCentralForceGPU(startIndex, endIndex, d, star);
computeCentralForceGPU(startIndex, endIndex, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"ax">(d),
getPtr<"ay">(d), getPtr<"az">(d), getPtr<"m">(d), d.g, star);
}
else { computeCentralForceImpl(startIndex, endIndex, d, star); }
}
Expand Down
37 changes: 22 additions & 15 deletions physics/Disk/include/computeCentralForce_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// Created by Noah Kubli on 11.03.2024.
//
#include <cub/cub.cuh>
#include <type_traits>
#include "cuda_runtime.h"

#include "cstone/cuda/cuda_utils.cuh"
Expand All @@ -26,14 +27,14 @@ __device__ void atomicAddVec4(cstone::Vec4<T>* x, const cstone::Vec4<T>& y)
atomicAdd(&(*x)[3], y[3]);
}

template<size_t numThreads, typename Tpos, typename Ta, typename Tm, typename Tsp, typename Tsm, typename Tg,
typename Tis, typename Tf>
__global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tpos* x, const Tpos* y, const Tpos* z,
Ta* ax, Ta* ay, Ta* az, const Tm* m, const cstone::Vec3<Tsp> star_position,
Tsm sm, Tg g, Tis inner_size2, Tf* force_device)
template<size_t numThreads, typename Treal, typename Tmass, typename Ta, typename Tstar, typename Tforce>
__global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
Ta* ax, Ta* ay, Ta* az, const Tmass* m,
const cstone::Vec3<Tstar> star_position, Tstar m_star, Treal g, Tstar inner_size2,
Tforce* force_device)
{
cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x;
Tf force{};
Tforce force{};

if (i >= last) { force = {0., 0., 0., 0.}; }
else
Expand All @@ -45,7 +46,7 @@ __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tp
const double dist = sqrt(dist2);
const double dist3 = dist2 * dist;

const double a_strength = 1. / dist3 * sm * g;
const double a_strength = 1. / dist3 * m_star * g;
const double ax_i = -dx * a_strength;
const double ay_i = -dy * a_strength;
const double az_i = -dz * a_strength;
Expand All @@ -59,16 +60,17 @@ __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tp
force[3] = -az_i * m[i];
}

typedef cub::BlockReduce<Tf, numThreads> BlockReduce;
typedef cub::BlockReduce<Tforce, numThreads> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;

Tf force_block = BlockReduce(temp_storage).Sum(force);
Tforce force_block = BlockReduce(temp_storage).Sum(force);
__syncthreads();
if (threadIdx.x == 0) { atomicAddVec4(force_device, force_block); }
}

template<typename Dataset, typename StarData>
void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star)
template<typename Treal, typename Thydro, typename Tmass>
void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, Thydro* ax,
Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData& star)
{
cstone::LocalIndex numParticles = last - first;
constexpr unsigned numThreads = 256;
Expand All @@ -80,9 +82,7 @@ void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& sta
checkGpuErrors(cudaMemcpy(force_device, &star.force_local, sizeof star.force_local, cudaMemcpyHostToDevice));

computeCentralForceGPUKernel<numThreads><<<numBlocks, numThreads>>>(
first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), rawPtr(d.devData.ax),
rawPtr(d.devData.ay), rawPtr(d.devData.az), rawPtr(d.devData.m), star.position, star.m, d.g,
star.inner_size * star.inner_size, force_device);
first, last, x, y, z, ax, ay, az, m, star.position, star.m, g, star.inner_size * star.inner_size, force_device);

checkGpuErrors(cudaDeviceSynchronize());
checkGpuErrors(cudaGetLastError());
Expand All @@ -91,5 +91,12 @@ void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& sta
checkGpuErrors(cudaFree(force_device));
}

template void computeCentralForceGPU(size_t, size_t, sphexa::ParticlesData<cstone::GpuTag>&, StarData&);
#define COMPUTE_CENTRAL_FORCE_GPU(Treal, Thydro, Tmass) \
template void computeCentralForceGPU(size_t, size_t, const Treal* x, const Treal* y, const Treal* z, Thydro* ax, \
Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData&);

COMPUTE_CENTRAL_FORCE_GPU(double, double, double);
COMPUTE_CENTRAL_FORCE_GPU(double, float, double);
COMPUTE_CENTRAL_FORCE_GPU(double, float, float);

} // namespace disk
6 changes: 4 additions & 2 deletions physics/Disk/include/computeCentralForce_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
//

#pragma once
#include "star_data.hpp"

namespace disk
{
template<typename Dataset, typename StarData>
void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star);
template<typename Treal, typename Thydro, typename Tmass>
void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, Thydro* ax,
Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData& star);
}
14 changes: 14 additions & 0 deletions physics/Disk/include/get_ptr.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
//
// Created by Noah Kubli on 12.02.2025.
//

#pragma once

namespace disk
{
template<util::StructuralString field, typename Dataset>
auto* getPtr(Dataset& d)
{
return rawPtr(get<field>(d));
}
} // namespace disk
2 changes: 1 addition & 1 deletion sph/include/sph/hydro_ve/momentum_energy_kern.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ namespace sph
{

template<class Tc, class T>
HOST_DEVICE_FUN T avRvCorrection(util::array<Tc, 3> R, Tc eta_ab, T eta_crit, const util::array<T, 6>& gradV_i,
HOST_DEVICE_FUN T avRvCorrection(util::array<Tc, 3> R, Tc eta_ab, T eta_crit, const util::array<T, 6>& gradV_i,
const util::array<const T, 6>& gradV_j)
{
T dmy1 = dot(R, symv(gradV_i, R));
Expand Down

0 comments on commit 515f635

Please sign in to comment.