From b1789ec51249c73538285817f2930f14b05548dc Mon Sep 17 00:00:00 2001
From: nknk567
Date: Wed, 12 Feb 2025 10:56:21 +0100
Subject: [PATCH] Pass data as pointers to GPU instead of dataset

---
 physics/Disk/include/accretion.hpp            |  5 ++-
 physics/Disk/include/accretion_gpu.cu         | 21 +++++++---
 physics/Disk/include/accretion_gpu.hpp        |  6 ++-
 physics/Disk/include/betaCooling.hpp          | 17 +++++---
 physics/Disk/include/betaCooling_gpu.cu       | 41 ++++++++++---------
 physics/Disk/include/betaCooling_gpu.hpp      | 12 ++++--
 physics/Disk/include/computeCentralForce.hpp  |  6 ++-
 .../Disk/include/computeCentralForce_gpu.cu   | 37 ++++++++++-------
 .../Disk/include/computeCentralForce_gpu.hpp  |  6 ++-
 physics/Disk/include/get_ptr.hpp              | 14 +++++++
 physics/Disk/include/star_data.hpp            |  2 +-
 .../sph/hydro_ve/momentum_energy_kern.hpp     |  2 +-
 12 files changed, 110 insertions(+), 59 deletions(-)
 create mode 100644 physics/Disk/include/get_ptr.hpp

diff --git a/physics/Disk/include/accretion.hpp b/physics/Disk/include/accretion.hpp
index affc84c2b..cc22b89e8 100644
--- a/physics/Disk/include/accretion.hpp
+++ b/physics/Disk/include/accretion.hpp
@@ -11,6 +11,7 @@
 #include "accretion_gpu.hpp"
 #include "buffer_reduce.hpp"
 #include "cstone/primitives/accel_switch.hpp"
+#include "get_ptr.hpp"
 
 namespace disk
 {
@@ -21,7 +22,9 @@ void computeAccretionCondition(size_t first, size_t last, Dataset& d, StarData& star)
 {
     if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
     {
-        computeAccretionConditionGPU(first, last, d, star);
+        computeAccretionConditionGPU(first, last, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"h">(d),
+                                     getPtr<"keys">(d), getPtr<"m">(d), getPtr<"vx">(d), getPtr<"vy">(d),
+                                     getPtr<"vz">(d), star);
     }
     else { computeAccretionConditionImpl(first, last, d, star); }
 }
diff --git a/physics/Disk/include/accretion_gpu.cu b/physics/Disk/include/accretion_gpu.cu
index ab7d19bdd..574c9f314 100644
--- a/physics/Disk/include/accretion_gpu.cu
+++ b/physics/Disk/include/accretion_gpu.cu
@@ -78,8 +78,10 @@ __global__ void computeAccretionConditionKernel(size_t first, size_t last, const
     if (threadIdx.x == 0) { atomicAddRS(device_removed, block_removed); }
 }
 
-template<typename Dataset, typename StarData>
-void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star)
+template<typename Treal, typename Thydro, typename Tkeys, typename Tmass>
+void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
+                                  const Thydro* h, Tkeys* keys, const Tmass* m, const Thydro* vx, const Thydro* vy,
+                                  const Thydro* vz, StarData& star)
 {
     cstone::LocalIndex numParticles = last - first;
     constexpr unsigned numThreads   = 256;
@@ -96,9 +98,8 @@ void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star)
     checkGpuErrors(cudaMemcpy(removed_device, &star.removed_local, sizeof star.removed_local, cudaMemcpyHostToDevice));
 
     computeAccretionConditionKernel<<<numBlocks, numThreads>>>(
-        first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), rawPtr(d.devData.h),
-        rawPtr(d.devData.keys), rawPtr(d.devData.m), rawPtr(d.devData.vx), rawPtr(d.devData.vy), rawPtr(d.devData.vz),
-        star.position, star.inner_size * star.inner_size, star.removal_limit_h, accreted_device, removed_device);
+        first, last, x, y, z, h, keys, m, vx, vy, vz, star.position, star.inner_size * star.inner_size,
+        star.removal_limit_h, accreted_device, removed_device);
 
     checkGpuErrors(cudaDeviceSynchronize());
     checkGpuErrors(cudaGetLastError());
@@ -110,5 +111,13 @@ void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star)
     checkGpuErrors(cudaFree(removed_device));
 }
 
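Note on the instantiation change just below: instead of instantiating the GPU entry point for the concrete ParticlesData type, the .cu file now stamps out the few scalar-type combinations the host side may request, so the device translation unit no longer needs the dataset headers. A minimal sketch of the pattern with a hypothetical function foo (not part of this patch):

    // foo.cu -- the definition stays in the device translation unit
    #include <cstddef>

    template<typename T>
    void foo(size_t first, size_t last, const T* x)
    {
        // device work would go here
        (void)first; (void)last; (void)x;
    }

    // emit the symbols callers are allowed to link against
    #define INSTANTIATE_FOO(T) template void foo(size_t first, size_t last, const T* x);
    INSTANTIATE_FOO(float);  // emits foo<float>
    INSTANTIATE_FOO(double); // emits foo<double>

Callers only see a matching template declaration in the header and link against one of the emitted instantiations.
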
-template void computeAccretionConditionGPU(size_t, size_t, sphexa::ParticlesData&, StarData&);
+#define COMPUTE_ACCRETION_CONDITION_GPU(Treal, Thydro, Tkeys, Tmass)                                                  \
+    template void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y,             \
+                                               const Treal* z, const Thydro* h, Tkeys* keys, const Tmass* m,          \
+                                               const Thydro* vx, const Thydro* vy, const Thydro* vz, StarData& star);
+
+COMPUTE_ACCRETION_CONDITION_GPU(double, double, size_t, double);
+COMPUTE_ACCRETION_CONDITION_GPU(double, float, size_t, double);
+COMPUTE_ACCRETION_CONDITION_GPU(double, float, size_t, float);
+
 } // namespace disk
diff --git a/physics/Disk/include/accretion_gpu.hpp b/physics/Disk/include/accretion_gpu.hpp
index 45478e2d5..db335e617 100644
--- a/physics/Disk/include/accretion_gpu.hpp
+++ b/physics/Disk/include/accretion_gpu.hpp
@@ -8,6 +8,8 @@
 namespace disk
 {
 
-template<typename Dataset, typename StarData>
-void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star);
+template<typename Treal, typename Thydro, typename Tkeys, typename Tmass>
+void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
+                                  const Thydro* h, Tkeys* keys, const Tmass* m, const Thydro* vx, const Thydro* vy,
+                                  const Thydro* vz, StarData& star);
 }
diff --git a/physics/Disk/include/betaCooling.hpp b/physics/Disk/include/betaCooling.hpp
index 36c66ec12..6b0e85990 100644
--- a/physics/Disk/include/betaCooling.hpp
+++ b/physics/Disk/include/betaCooling.hpp
@@ -11,6 +11,7 @@
 
 #include "betaCooling_gpu.hpp"
 #include "cstone/primitives/accel_switch.hpp"
+#include "get_ptr.hpp"
 
 namespace disk
 {
@@ -34,8 +35,8 @@ void betaCoolingImpl(size_t first, size_t last, Dataset& d, const StarData& star)
     }
 }
 
-template<typename Dataset, typename StarData>
-auto duTimestepImpl(size_t first, size_t last, const Dataset& d, const StarData& star)
+template<typename Dataset>
+auto duTimestepImpl(size_t first, size_t last, const Dataset& d)
 {
     using Tu = std::decay_t<decltype(d.u[0])>;
@@ -46,12 +47,13 @@ auto duTimestepImpl(size_t first, size_t last, const Dataset& d, const StarData& star)
 #pragma omp parallel for reduction(min : duTimestepMin)
     for (size_t i = first; i < last; i++)
     {
-        Tt duTimestep = star.K_u * std::abs(d.u[i] / d.du[i]);
+        Tt duTimestep = std::abs(d.u[i] / d.du[i]);
         duTimestepMin = std::min(duTimestepMin, duTimestep);
     }
     return duTimestepMin;
 }
 
+//! @brief Cool the disk with a cooling time proportional to the Keplerian orbital time (around the star)
 template<typename Dataset, typename StarData>
 void betaCooling(size_t startIndex, size_t endIndex, Dataset& d, const StarData& star)
 {
@@ -60,12 +62,15 @@ void betaCooling(size_t startIndex, size_t endIndex, Dataset& d, const StarData& star)
     {
         if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
         {
-            betaCoolingGPU(startIndex, endIndex, d, star);
+            betaCoolingGPU(startIndex, endIndex, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"u">(d),
+                           getPtr<"rho">(d), getPtr<"du">(d), d.g, star);
        }
         else { betaCoolingImpl(startIndex, endIndex, d, star); }
     }
 }
 
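For reference, the cooling model dispatched above (implemented in betaCoolingImpl and, further down, in betaCooling_gpu.cu) damps the internal energy on a timescale proportional to the local Keplerian orbital time: du/dt -= u * omega / beta, where omega is presumably the Keplerian angular frequency sqrt(g * M_star / r^3) implied by the doc comment. A self-contained CPU sketch under those assumptions (names like betaCoolSketch and mStar are illustrative, not part of the patch):

    #include <cmath>
    #include <cstddef>

    template<class T>
    void betaCoolSketch(size_t first, size_t last, const T* x, const T* y, const T* z, const T* u, T* du,
                        T g, T mStar, T beta)
    {
        for (size_t i = first; i < last; ++i)
        {
            T r2    = x[i] * x[i] + y[i] * y[i] + z[i] * z[i];     // squared distance to the star (placed at the origin here)
            T omega = std::sqrt(g * mStar / (r2 * std::sqrt(r2))); // Keplerian angular frequency sqrt(GM/r^3)
            du[i] -= u[i] * omega / beta;                          // cool on the timescale beta / omega
        }
    }

The real kernel additionally skips particles denser than star.cooling_rho_limit and does not cool below star.u_floor, per the StarData comments.
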
+//! @brief Compute a maximal time step depending on how fast the internal energy changes
+//! (e.g. due to cooling).
 template<typename Dataset, typename StarData>
 void duTimestep(size_t startIndex, size_t endIndex, Dataset& d, StarData& star)
 {
@@ -77,9 +82,9 @@ void duTimestep(size_t startIndex, size_t endIndex, Dataset& d, StarData& star)
     {
         if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
         {
-            star.t_du = duTimestepGPU(startIndex, endIndex, d, star);
+            star.t_du = star.K_u * duTimestepGPU(startIndex, endIndex, getPtr<"u">(d), getPtr<"du">(d));
         }
-        else { star.t_du = duTimestepImpl(startIndex, endIndex, d, star); }
+        else { star.t_du = star.K_u * duTimestepImpl(startIndex, endIndex, d); }
     }
 }
diff --git a/physics/Disk/include/betaCooling_gpu.cu b/physics/Disk/include/betaCooling_gpu.cu
index b54df04ef..a737cc4ce 100644
--- a/physics/Disk/include/betaCooling_gpu.cu
+++ b/physics/Disk/include/betaCooling_gpu.cu
@@ -21,10 +21,10 @@
 namespace disk
 {
 
-template<typename Tpos, typename Tdu, typename Tu, typename Ts, typename Trho, typename Trho2>
-__global__ void betaCoolingGPUKernel(size_t first, size_t last, const Tpos* x, const Tpos* y, const Tpos* z, Tdu* du,
-                                     const Tu* u, Ts star_mass, cstone::Vec3<Ts> star_position, Ts beta, Tpos g,
-                                     const Trho* rho, Ts u_floor, Trho2 cooling_rho_limit)
+template<typename Treal, typename Thydro, typename Ts>
+__global__ void betaCoolingGPUKernel(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
+                                     const Treal* u, const Thydro* rho, Treal* du, Treal g, Ts star_mass,
+                                     cstone::Vec3<Ts> star_position, Ts beta, Ts u_floor, Ts cooling_rho_limit)
 {
     cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -40,22 +40,26 @@ __global__ void betaCoolingGPUKernel(size_t first, size_t last, const Tpos* x, c
     du[i] += -u[i] * omega / beta;
 }
 
-template<typename Dataset, typename StarData>
-void betaCoolingGPU(size_t first, size_t last, Dataset& d, StarData& star)
+template<typename Treal, typename Thydro>
+void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, const Treal* u,
+                    const Thydro* rho, Treal* du, const Treal g, const StarData& star)
 {
     cstone::LocalIndex numParticles = last - first;
     unsigned           numThreads   = 256;
     unsigned           numBlocks    = (numParticles + numThreads - 1) / numThreads;
 
-    betaCoolingGPUKernel<<<numBlocks, numThreads>>>(first, last, rawPtr(d.devData.x), rawPtr(d.devData.y),
-                                                    rawPtr(d.devData.z), rawPtr(d.devData.du), rawPtr(d.devData.u),
-                                                    star.m, star.position, star.beta, d.g, rawPtr(d.devData.rho),
-                                                    star.u_floor, star.cooling_rho_limit);
+    betaCoolingGPUKernel<<<numBlocks, numThreads>>>(first, last, x, y, z, u, rho, du, g, star.m, star.position,
+                                                    star.beta, star.u_floor, star.cooling_rho_limit);
 
     checkGpuErrors(cudaDeviceSynchronize());
 }
 
-template void betaCoolingGPU(size_t, size_t, sphexa::ParticlesData&, const StarData&);
+#define BETA_COOLING_GPU(Treal, Thydro)                                                                               \
+    template void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,           \
+                                 const Treal* u, const Thydro* rho, Treal* du, const Treal g, const StarData& star);
+
+BETA_COOLING_GPU(double, double);
+BETA_COOLING_GPU(double, float);
 
 template<typename T>
 struct AbsDivide
 {
     HOST_DEVICE_FUN T operator()(const thrust::tuple<T, T>& X) { return fabs(thrust::get<0>(X) / thrust::get<1>(X)); }
 };
 
-template<typename Dataset, typename StarData>
-double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star)
+template<typename Treal>
+double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du)
 {
     cstone::LocalIndex numParticles = last - first;
 
-    const auto* u  = rawPtr(d.devData.u);
-    const auto* du = rawPtr(d.devData.du);
-
     using Tu  = std::decay_t<decltype(*u)>;
     using Tdu = std::decay_t<decltype(*du)>;
 
@@ -82,10 +83,12 @@ double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star)
 
     double init = INFINITY;
 
-    return star.K_u *
-           thrust::transform_reduce(thrust::device, begin, end, AbsDivide<Tu>{}, init, thrust::minimum<double>{});
+    return thrust::transform_reduce(thrust::device, begin, end, AbsDivide<Tu>{}, init,
+                                    thrust::minimum<double>{});
 }
 
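duTimestepGPU above finds min |u/du| in one pass with thrust::transform_reduce over a zip of the u and du arrays; the caller now multiplies by star.K_u, so the GPU and CPU paths apply the safety factor identically. A stand-alone sketch of the same reduction pattern (hypothetical minAbsRatio, fixed to double, not the production code):

    #include <cmath>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/tuple.h>

    struct AbsRatio
    {
        __host__ __device__ double operator()(const thrust::tuple<double, double>& t) const
        {
            return fabs(thrust::get<0>(t) / thrust::get<1>(t)); // |u / du|
        }
    };

    double minAbsRatio(const thrust::device_vector<double>& u, const thrust::device_vector<double>& du)
    {
        auto begin = thrust::make_zip_iterator(thrust::make_tuple(u.begin(), du.begin()));
        auto end   = thrust::make_zip_iterator(thrust::make_tuple(u.end(), du.end()));
        // map each (u, du) pair to |u/du|, then reduce with min; INFINITY is the neutral element
        return thrust::transform_reduce(thrust::device, begin, end, AbsRatio{}, double(INFINITY),
                                        thrust::minimum<double>{});
    }
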
-template double duTimestepGPU(size_t, size_t, const sphexa::ParticlesData&, const StarData&);
+#define DU_TIMESTEP_GPU(Treal)                                                                                        \
+    template double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du);
+
+DU_TIMESTEP_GPU(double);
 
 } // namespace disk
diff --git a/physics/Disk/include/betaCooling_gpu.hpp b/physics/Disk/include/betaCooling_gpu.hpp
index dfb3e0f71..e6145dd09 100644
--- a/physics/Disk/include/betaCooling_gpu.hpp
+++ b/physics/Disk/include/betaCooling_gpu.hpp
@@ -4,12 +4,16 @@
 
 #pragma once
 
+#include "star_data.hpp"
+
 namespace disk
 {
 
-template<typename Dataset, typename StarData>
-void betaCoolingGPU(size_t first, size_t last, Dataset& d, StarData& star);
-template<typename Dataset, typename StarData>
-double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star);
+template<typename Treal, typename Thydro>
+void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, const Treal* u,
+                    const Thydro* rho, Treal* du, const Treal g, const StarData& star);
+
+template<typename Treal>
+double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du);
 
 } // namespace disk
diff --git a/physics/Disk/include/computeCentralForce.hpp b/physics/Disk/include/computeCentralForce.hpp
index ea70e8cd9..62b8da007 100644
--- a/physics/Disk/include/computeCentralForce.hpp
+++ b/physics/Disk/include/computeCentralForce.hpp
@@ -10,6 +10,7 @@
 #include "cstone/primitives/accel_switch.hpp"
 #include "cstone/tree/definitions.h"
 #include "computeCentralForce_gpu.hpp"
+#include "get_ptr.hpp"
 
 namespace disk
 {
@@ -20,7 +21,7 @@ void computeCentralForceImpl(size_t first, size_t last, Dataset& d, StarData& star)
     cstone::Vec4<double> force_local{};
     const double         inner_size2 = star.inner_size * star.inner_size;
 
-#pragma omp declare reduction(add_force : cstone::Vec4<double> : omp_out = omp_out + omp_in) initializer(omp_priv = {})
+#pragma omp declare reduction(add_force : cstone::Vec4<double> : omp_out = omp_out + omp_in) initializer(omp_priv = {})
 
 #pragma omp parallel for reduction(add_force : force_local)
     for (size_t i = first; i < last; i++)
@@ -54,7 +55,8 @@ void computeCentralForce(size_t startIndex, size_t endIndex, Dataset& d, StarData& star)
 {
     if constexpr (cstone::HaveGpu<typename Dataset::AcceleratorType>{})
    {
-        computeCentralForceGPU(startIndex, endIndex, d, star);
+        computeCentralForceGPU(startIndex, endIndex, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"ax">(d),
+                               getPtr<"ay">(d), getPtr<"az">(d), getPtr<"m">(d), d.g, star);
     }
     else { computeCentralForceImpl(startIndex, endIndex, d, star); }
 }
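computeCentralForceImpl above accumulates the particles' back-reaction on the star with a user-defined OpenMP reduction over a 4-component vector. A minimal sketch of that reduction, with std::array<double, 4> standing in for cstone::Vec4<double> (assumed shape; sumForces is illustrative):

    #include <array>
    #include <cstddef>

    using Vec4 = std::array<double, 4>;

    inline Vec4 operator+(const Vec4& a, const Vec4& b) { return {a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]}; }

    // teach OpenMP how to combine thread-private partial sums of Vec4
    #pragma omp declare reduction(add_force : Vec4 : omp_out = omp_out + omp_in) initializer(omp_priv = Vec4{})

    Vec4 sumForces(const Vec4* f, size_t n)
    {
        Vec4 total{};
    #pragma omp parallel for reduction(add_force : total)
        for (size_t i = 0; i < n; ++i)
        {
            total = total + f[i]; // each thread sums into its private copy; OpenMP merges them at the end
        }
        return total;
    }
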
diff --git a/physics/Disk/include/computeCentralForce_gpu.cu b/physics/Disk/include/computeCentralForce_gpu.cu
index 6c986ec8e..815e768c3 100644
--- a/physics/Disk/include/computeCentralForce_gpu.cu
+++ b/physics/Disk/include/computeCentralForce_gpu.cu
@@ -2,6 +2,7 @@
 // Created by Noah Kubli on 11.03.2024.
 //
 
 #include
+#include
 
 #include "cuda_runtime.h"
 #include "cstone/cuda/cuda_utils.cuh"
@@ -26,14 +27,14 @@ __device__ void atomicAddVec4(cstone::Vec4* x, const cstone::Vec4& y)
     atomicAdd(&(*x)[3], y[3]);
 }
 
-template<typename Tpos, typename Ta, typename Tm, typename Tsm, typename Tg, typename Tis, typename Tf>
-__global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tpos* x, const Tpos* y, const Tpos* z,
-                                             Ta* ax, Ta* ay, Ta* az, const Tm* m, const cstone::Vec3<Tpos> star_position,
-                                             Tsm sm, Tg g, Tis inner_size2, Tf* force_device)
+template<typename Treal, typename Ta, typename Tmass, typename Tstar, typename Tforce>
+__global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
+                                             Ta* ax, Ta* ay, Ta* az, const Tmass* m,
+                                             const cstone::Vec3<Tstar> star_position, Tstar m_star, Treal g, Tstar inner_size2,
+                                             Tforce* force_device)
 {
     cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x;
 
-    Tf force{};
+    Tforce force{};
     if (i >= last) { force = {0., 0., 0., 0.}; }
     else
@@ -45,7 +46,7 @@ __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tp
         const double dist  = sqrt(dist2);
         const double dist3 = dist2 * dist;
 
-        const double a_strength = 1. / dist3 * sm * g;
+        const double a_strength = 1. / dist3 * m_star * g;
         const double ax_i       = -dx * a_strength;
         const double ay_i       = -dy * a_strength;
         const double az_i       = -dz * a_strength;
@@ -59,16 +60,17 @@ __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tp
         force[3] = -az_i * m[i];
     }
 
-    typedef cub::BlockReduce<Tf, numThreads> BlockReduce;
+    typedef cub::BlockReduce<Tforce, numThreads> BlockReduce;
     __shared__ typename BlockReduce::TempStorage temp_storage;
 
-    Tf force_block = BlockReduce(temp_storage).Sum(force);
+    Tforce force_block = BlockReduce(temp_storage).Sum(force);
     __syncthreads();
 
     if (threadIdx.x == 0) { atomicAddVec4(force_device, force_block); }
 }
 
-template<typename Dataset, typename StarData>
-void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star)
+template<typename Treal, typename Thydro, typename Tmass>
+void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, Thydro* ax,
+                            Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData& star)
 {
     cstone::LocalIndex numParticles = last - first;
     constexpr unsigned numThreads   = 256;
@@ -80,9 +82,7 @@ void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star)
     checkGpuErrors(cudaMemcpy(force_device, &star.force_local, sizeof star.force_local, cudaMemcpyHostToDevice));
 
     computeCentralForceGPUKernel<<<numBlocks, numThreads>>>(
-        first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), rawPtr(d.devData.ax),
-        rawPtr(d.devData.ay), rawPtr(d.devData.az), rawPtr(d.devData.m), star.position, star.m, d.g,
-        star.inner_size * star.inner_size, force_device);
+        first, last, x, y, z, ax, ay, az, m, star.position, star.m, g, star.inner_size * star.inner_size, force_device);
 
     checkGpuErrors(cudaDeviceSynchronize());
     checkGpuErrors(cudaGetLastError());
@@ -91,5 +91,12 @@ void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star)
     checkGpuErrors(cudaFree(force_device));
 }
 
-template void computeCentralForceGPU(size_t, size_t, sphexa::ParticlesData&, StarData&);
+#define COMPUTE_CENTRAL_FORCE_GPU(Treal, Thydro, Tmass)                                                               \
+    template void computeCentralForceGPU(size_t, size_t, const Treal* x, const Treal* y, const Treal* z, Thydro* ax, \
+                                         Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData&);
+
+COMPUTE_CENTRAL_FORCE_GPU(double, double, double);
+COMPUTE_CENTRAL_FORCE_GPU(double, float, double);
+COMPUTE_CENTRAL_FORCE_GPU(double, float, float);
+
 } // namespace disk
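The kernel above reduces each thread's contribution to the star force within the block via cub::BlockReduce and then issues a single atomic update per block (via atomicAddVec4), keeping global-memory traffic to one atomic per 256 threads. A stand-alone sketch of that pattern (hypothetical blockSumKernel summing one double per thread; atomicAdd on double requires compute capability 6.0 or newer):

    #include <cub/block/block_reduce.cuh>
    #include <cstddef>

    constexpr unsigned kThreads = 256;

    __global__ void blockSumKernel(const double* in, size_t n, double* out)
    {
        size_t i = blockIdx.x * size_t(blockDim.x) + threadIdx.x;
        double v = (i < n) ? in[i] : 0.0; // threads past the end contribute the neutral element

        typedef cub::BlockReduce<double, kThreads> BlockReduce;
        __shared__ typename BlockReduce::TempStorage temp_storage;
        double block_sum = BlockReduce(temp_storage).Sum(v);

        if (threadIdx.x == 0) { atomicAdd(out, block_sum); } // one atomic per block instead of one per thread
    }
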
diff --git a/physics/Disk/include/computeCentralForce_gpu.hpp b/physics/Disk/include/computeCentralForce_gpu.hpp
index bdf4b66be..0160d9881 100644
--- a/physics/Disk/include/computeCentralForce_gpu.hpp
+++ b/physics/Disk/include/computeCentralForce_gpu.hpp
@@ -3,9 +3,11 @@
 //
 
 #pragma once
+#include "star_data.hpp"
 
 namespace disk
 {
-template<typename Dataset, typename StarData>
-void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star);
+template<typename Treal, typename Thydro, typename Tmass>
+void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, Thydro* ax,
+                            Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData& star);
 }
diff --git a/physics/Disk/include/get_ptr.hpp b/physics/Disk/include/get_ptr.hpp
new file mode 100644
index 000000000..211495bea
--- /dev/null
+++ b/physics/Disk/include/get_ptr.hpp
@@ -0,0 +1,14 @@
+//
+// Created by Noah Kubli on 12.02.2025.
+//
+
+#pragma once
+
+namespace disk
+{
+template<util::StructuralString field, typename Dataset>
+auto* getPtr(Dataset& d)
+{
+    return rawPtr(get<field>(d));
+}
+} // namespace disk
diff --git a/physics/Disk/include/star_data.hpp b/physics/Disk/include/star_data.hpp
index a6c40e862..91d85b037 100644
--- a/physics/Disk/include/star_data.hpp
+++ b/physics/Disk/include/star_data.hpp
@@ -38,7 +38,7 @@ struct StarData
     double removal_limit_h{std::numeric_limits<double>::infinity()};
 
     //! @brief Don't cool any particle above this density threshold
-    float  cooling_rho_limit{std::numeric_limits<float>::infinity()};
+    double cooling_rho_limit{std::numeric_limits<double>::infinity()};
 
     //! @brief Don't cool any particle whose internal energy is below
     double u_floor{0.};
diff --git a/sph/include/sph/hydro_ve/momentum_energy_kern.hpp b/sph/include/sph/hydro_ve/momentum_energy_kern.hpp
index 21d8ede46..879c18081 100644
--- a/sph/include/sph/hydro_ve/momentum_energy_kern.hpp
+++ b/sph/include/sph/hydro_ve/momentum_energy_kern.hpp
@@ -41,7 +41,7 @@ namespace sph
 {
 
 template<class Tc, class T>
-HOST_DEVICE_FUN T avRvCorrection(util::array R, Tc eta_ab, T eta_crit, const util::array& gradV_i,
+HOST_DEVICE_FUN T avRvCorrection(util::array R, Tc eta_ab, T eta_crit, const util::array& gradV_i,
                                  const util::array& gradV_j)
 {
     T dmy1 = dot(R, symv(gradV_i, R));
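The new getPtr<"field">(d) helper keys a particle field by a compile-time string and hands back a raw pointer, which is what lets the dispatch headers above pass plain pointers into the .cu translation units instead of the whole dataset. A toy illustration of the string-keyed accessor idea (assumed C++20 structural-string machinery with made-up names FixedString, ToyData and getPtrToy; the real helper relies on cstone's get<"field">() and rawPtr(), whose signatures are not shown in this patch):

    #include <algorithm>
    #include <cstddef>
    #include <string_view>
    #include <vector>

    template<size_t N>
    struct FixedString // structural type usable as a non-type template parameter
    {
        char value[N]{};
        constexpr FixedString(const char (&s)[N]) { std::copy_n(s, N, value); }
        constexpr operator std::string_view() const { return {value, N - 1}; }
    };

    struct ToyData
    {
        std::vector<double> x, y;
    };

    template<FixedString field>
    double* getPtrToy(ToyData& d)
    {
        if constexpr (std::string_view(field) == "x") { return d.x.data(); }
        else { return d.y.data(); }
    }

    // usage: double* px = getPtrToy<"x">(d);

Routing the Disk GPU code through pointer-based interfaces means accretion_gpu.cu, betaCooling_gpu.cu and computeCentralForce_gpu.cu only need explicit instantiations for a few scalar types rather than for every ParticlesData configuration.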