From 5f6e0a51268868098e70b0dc4278ca6a36804afc Mon Sep 17 00:00:00 2001 From: nknk567 Date: Wed, 12 Feb 2025 10:56:21 +0100 Subject: [PATCH] Pass data as pointers to GPU instead of dataset --- physics/Disk/include/accretion.hpp | 5 ++- physics/Disk/include/accretion_gpu.cu | 21 +++++++--- physics/Disk/include/accretion_gpu.hpp | 6 ++- physics/Disk/include/betaCooling.hpp | 17 +++++--- physics/Disk/include/betaCooling_gpu.cu | 42 ++++++++++--------- physics/Disk/include/betaCooling_gpu.hpp | 10 +++-- physics/Disk/include/computeCentralForce.hpp | 6 ++- .../Disk/include/computeCentralForce_gpu.cu | 27 +++++++----- .../Disk/include/computeCentralForce_gpu.hpp | 5 ++- physics/Disk/include/get_ptr.hpp | 14 +++++++ .../sph/hydro_ve/momentum_energy_kern.hpp | 2 +- 11 files changed, 101 insertions(+), 54 deletions(-) create mode 100644 physics/Disk/include/get_ptr.hpp diff --git a/physics/Disk/include/accretion.hpp b/physics/Disk/include/accretion.hpp index affc84c2b..cc22b89e8 100644 --- a/physics/Disk/include/accretion.hpp +++ b/physics/Disk/include/accretion.hpp @@ -11,6 +11,7 @@ #include "accretion_gpu.hpp" #include "buffer_reduce.hpp" #include "cstone/primitives/accel_switch.hpp" +#include "get_ptr.hpp" namespace disk { @@ -21,7 +22,9 @@ void computeAccretionCondition(size_t first, size_t last, Dataset& d, StarData& { if constexpr (cstone::HaveGpu{}) { - computeAccretionConditionGPU(first, last, d, star); + computeAccretionConditionGPU(first, last, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"h">(d), + getPtr<"keys">(d), getPtr<"m">(d), getPtr<"vx">(d), getPtr<"vy">(d), + getPtr<"vz">(d), star); } else { computeAccretionConditionImpl(first, last, d, star); } } diff --git a/physics/Disk/include/accretion_gpu.cu b/physics/Disk/include/accretion_gpu.cu index ab7d19bdd..f8eea5065 100644 --- a/physics/Disk/include/accretion_gpu.cu +++ b/physics/Disk/include/accretion_gpu.cu @@ -78,8 +78,10 @@ __global__ void computeAccretionConditionKernel(size_t first, size_t last, const if (threadIdx.x == 0) { atomicAddRS(device_removed, block_removed); } } -template -void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star) +template +void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, + const Treal* h, const Tkeys* keys, const Tmass* m, const Thydro* vx, const Thydro* vy, + const Thydro* vz, StarData& star) { cstone::LocalIndex numParticles = last - first; constexpr unsigned numThreads = 256; @@ -96,9 +98,8 @@ void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarDat checkGpuErrors(cudaMemcpy(removed_device, &star.removed_local, sizeof star.removed_local, cudaMemcpyHostToDevice)); computeAccretionConditionKernel<<>>( - first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), rawPtr(d.devData.h), - rawPtr(d.devData.keys), rawPtr(d.devData.m), rawPtr(d.devData.vx), rawPtr(d.devData.vy), rawPtr(d.devData.vz), - star.position, star.inner_size * star.inner_size, star.removal_limit_h, accreted_device, removed_device); + first, last, x, y, z, h, keys, m, vx, vy, vz, star.position, star.inner_size * star.inner_size, + star.removal_limit_h, accreted_device, removed_device); checkGpuErrors(cudaDeviceSynchronize()); checkGpuErrors(cudaGetLastError()); @@ -110,5 +111,13 @@ void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarDat checkGpuErrors(cudaFree(removed_device)); } -template void computeAccretionConditionGPU(size_t, size_t, sphexa::ParticlesData&, StarData&); +#define COMPUTE_ACCRETION_CONDITION_GPU(Treal, Tkeys, Tmass, Thydro) \ + template void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, \ + const Treal* z, const Treal* h, const Tkeys* keys, const Tmass* m, \ + const Thydro* vx, const Thydro* vy, const Thydro* vz, StarData& star); + +COMPUTE_ACCRETION_CONDITION_GPU(double, size_t, double, double); +COMPUTE_ACCRETION_CONDITION_GPU(double, size_t, double, float); +COMPUTE_ACCRETION_CONDITION_GPU(double, size_t, float, float); + } // namespace disk diff --git a/physics/Disk/include/accretion_gpu.hpp b/physics/Disk/include/accretion_gpu.hpp index 45478e2d5..bd4ca4683 100644 --- a/physics/Disk/include/accretion_gpu.hpp +++ b/physics/Disk/include/accretion_gpu.hpp @@ -8,6 +8,8 @@ namespace disk { -template -void computeAccretionConditionGPU(size_t first, size_t last, Dataset& d, StarData& star); +template +void computeAccretionConditionGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, + const Treal* h, const Tkeys* keys, const Tmass* m, const Thydro* vx, const Thydro* vy, + const Thydro* vz, StarData& star); } diff --git a/physics/Disk/include/betaCooling.hpp b/physics/Disk/include/betaCooling.hpp index 36c66ec12..158fa9b9f 100644 --- a/physics/Disk/include/betaCooling.hpp +++ b/physics/Disk/include/betaCooling.hpp @@ -11,6 +11,7 @@ #include "betaCooling_gpu.hpp" #include "cstone/primitives/accel_switch.hpp" +#include "get_ptr.hpp" namespace disk { @@ -34,8 +35,8 @@ void betaCoolingImpl(size_t first, size_t last, Dataset& d, const StarData& star } } -template -auto duTimestepImpl(size_t first, size_t last, const Dataset& d, const StarData& star) +template +auto duTimestepImpl(size_t first, size_t last, const Dataset& d) { using Tu = std::decay_t; @@ -46,12 +47,13 @@ auto duTimestepImpl(size_t first, size_t last, const Dataset& d, const StarData& #pragma omp parallel for reduction(min : duTimestepMin) for (size_t i = first; i < last; i++) { - Tt duTimestep = star.K_u * std::abs(d.u[i] / d.du[i]); + Tt duTimestep = std::abs(d.u[i] / d.du[i]); duTimestepMin = std::min(duTimestepMin, duTimestep); } return duTimestepMin; } +//! @brief Cool the disk with a cooling time proportional to the Keplerian orbital time (around the star) template void betaCooling(size_t startIndex, size_t endIndex, Dataset& d, const StarData& star) { @@ -60,12 +62,15 @@ void betaCooling(size_t startIndex, size_t endIndex, Dataset& d, const StarData& { if constexpr (cstone::HaveGpu{}) { - betaCoolingGPU(startIndex, endIndex, d, star); + betaCoolingGPU(startIndex, endIndex, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"u">(d), + getPtr<"rho">(d), getPtr<"du">(d), star); } else { betaCoolingImpl(startIndex, endIndex, d, star); } } } +//! @brief Compute a maximal time step depending on how fast the internal energy changes +//! (e.g. due to cooling). template void duTimestep(size_t startIndex, size_t endIndex, Dataset& d, StarData& star) { @@ -77,9 +82,9 @@ void duTimestep(size_t startIndex, size_t endIndex, Dataset& d, StarData& star) { if constexpr (cstone::HaveGpu{}) { - star.t_du = duTimestepGPU(startIndex, endIndex, d, star); + star.t_du = star.K_u * duTimestepGPU(startIndex, endIndex, getPtr<"u">(d), getPtr<"du">(d)); } - else { star.t_du = duTimestepImpl(startIndex, endIndex, d, star); } + else { star.t_du = star.K_u * duTimestepImpl(startIndex, endIndex, d); } } } diff --git a/physics/Disk/include/betaCooling_gpu.cu b/physics/Disk/include/betaCooling_gpu.cu index b54df04ef..843fd83e1 100644 --- a/physics/Disk/include/betaCooling_gpu.cu +++ b/physics/Disk/include/betaCooling_gpu.cu @@ -21,10 +21,11 @@ namespace disk { -template -__global__ void betaCoolingGPUKernel(size_t first, size_t last, const Tpos* x, const Tpos* y, const Tpos* z, Tdu* du, - const Tu* u, Ts star_mass, cstone::Vec3 star_position, Ts beta, Tpos g, - const Trho* rho, Ts u_floor, Trho2 cooling_rho_limit) +template +__global__ void betaCoolingGPUKernel(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, + const Treal* u, const Thydro* rho, Treal* du, auto star_mass, + cstone::Vec3 star_position, auto beta, auto g, auto u_floor, + auto cooling_rho_limit) { cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x; @@ -40,22 +41,26 @@ __global__ void betaCoolingGPUKernel(size_t first, size_t last, const Tpos* x, c du[i] += -u[i] * omega / beta; } -template -void betaCoolingGPU(size_t first, size_t last, Dataset& d, StarData& star) +template +void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, const Treal* u, + const Thydro* rho, Treal* du, StarData& star); { cstone::LocalIndex numParticles = last - first; unsigned numThreads = 256; unsigned numBlocks = (numParticles + numThreads - 1) / numThreads; - betaCoolingGPUKernel<<>>(first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), - rawPtr(d.devData.z), rawPtr(d.devData.du), rawPtr(d.devData.u), - star.m, star.position, star.beta, d.g, rawPtr(d.devData.rho), - star.u_floor, star.cooling_rho_limit); + betaCoolingGPUKernel<<>>(first, last, x, y, z, u, rho, du, star.m, star.position, star.beta, + d.g, star.u_floor, star.cooling_rho_limit); checkGpuErrors(cudaDeviceSynchronize()); } -template void betaCoolingGPU(size_t, size_t, sphexa::ParticlesData&, const StarData&); +#define BETA_COOLING_GPU(Treal, Thydro) \ + template void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, \ + const Treal* u, const Thydro* rho, Treal* du, StarData& star); + +BETA_COOLING_GPU(double, double); +BETA_COOLING_GPU(double, float); template struct AbsDivide @@ -66,14 +71,11 @@ struct AbsDivide } }; -template -double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star) +template +double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du); { cstone::LocalIndex numParticles = last - first; - const auto* u = rawPtr(d.devData.u); - const auto* du = rawPtr(d.devData.du); - using Tu = std::decay_t; using Tdu = std::decay_t; @@ -82,10 +84,12 @@ double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData double init = INFINITY; - return star.K_u * - thrust::transform_reduce(thrust::device, begin, end, AbsDivide{}, init, thrust::minimum{}); + return thrust::transform_reduce(thrust::device, begin, end, AbsDivide{}, init, thrust::minimum{}); } -template double duTimestepGPU(size_t, size_t, const sphexa::ParticlesData&, const StarData&); +#define DU_TIMESTEP_GPU(Treal) \ + template double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du); + +DU_TIMESTEP_GPU(double); } // namespace disk diff --git a/physics/Disk/include/betaCooling_gpu.hpp b/physics/Disk/include/betaCooling_gpu.hpp index dfb3e0f71..8289e753d 100644 --- a/physics/Disk/include/betaCooling_gpu.hpp +++ b/physics/Disk/include/betaCooling_gpu.hpp @@ -6,10 +6,12 @@ namespace disk { -template -void betaCoolingGPU(size_t first, size_t last, Dataset& d, StarData& star); -template -double duTimestepGPU(size_t first, size_t last, const Dataset& d, const StarData& star); +template +void betaCoolingGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, const Treal* u, + const Thydro* rho, Treal* du, StarData& star); + +template +double duTimestepGPU(size_t first, size_t last, const Treal* u, const Treal* du); } // namespace disk diff --git a/physics/Disk/include/computeCentralForce.hpp b/physics/Disk/include/computeCentralForce.hpp index ea70e8cd9..83022dd64 100644 --- a/physics/Disk/include/computeCentralForce.hpp +++ b/physics/Disk/include/computeCentralForce.hpp @@ -10,6 +10,7 @@ #include "cstone/primitives/accel_switch.hpp" #include "cstone/tree/definitions.h" #include "computeCentralForce_gpu.hpp" +#include "get_ptr.hpp" namespace disk { @@ -20,7 +21,7 @@ void computeCentralForceImpl(size_t first, size_t last, Dataset& d, StarData& st cstone::Vec4 force_local{}; const double inner_size2 = star.inner_size * star.inner_size; -#pragma omp declare reduction(add_force : cstone::Vec4 : omp_out = omp_out + omp_in) initializer(omp_priv = {}) +#pragma omp declare reduction(add_force : cstone::Vec4 : omp_out = omp_out + omp_in) initializer(omp_priv = {}) #pragma omp parallel for reduction(add_force : force_local) for (size_t i = first; i < last; i++) @@ -54,7 +55,8 @@ void computeCentralForce(size_t startIndex, size_t endIndex, Dataset& d, StarDat { if constexpr (cstone::HaveGpu{}) { - computeCentralForceGPU(startIndex, endIndex, d, star); + computeCentralForceGPU(startIndex, endIndex, getPtr<"x">(d), getPtr<"y">(d), getPtr<"z">(d), getPtr<"ax">(d), + getPtr<"ay">(d), getPtr<"az">(d), getPtr<"m">(d), star); } else { computeCentralForceImpl(startIndex, endIndex, d, star); } } diff --git a/physics/Disk/include/computeCentralForce_gpu.cu b/physics/Disk/include/computeCentralForce_gpu.cu index 6c986ec8e..1ea11f3b3 100644 --- a/physics/Disk/include/computeCentralForce_gpu.cu +++ b/physics/Disk/include/computeCentralForce_gpu.cu @@ -26,11 +26,10 @@ __device__ void atomicAddVec4(cstone::Vec4* x, const cstone::Vec4& y) atomicAdd(&(*x)[3], y[3]); } -template +template __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tpos* x, const Tpos* y, const Tpos* z, - Ta* ax, Ta* ay, Ta* az, const Tm* m, const cstone::Vec3 star_position, - Tsm sm, Tg g, Tis inner_size2, Tf* force_device) + Ta* ax, Ta* ay, Ta* az, const auto* m, const cstone::Vec3 star_position, + auto sm, auto g, auto inner_size2, auto* force_device) { cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x; Tf force{}; @@ -67,8 +66,9 @@ __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tp if (threadIdx.x == 0) { atomicAddVec4(force_device, force_block); } } -template -void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star) +template +void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, Treal* ax, + Treal* ay, Treal* az, const Tmass* m, StarData& star) { cstone::LocalIndex numParticles = last - first; constexpr unsigned numThreads = 256; @@ -79,10 +79,9 @@ void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& sta checkGpuErrors(cudaMalloc(reinterpret_cast(&force_device), sizeof *force_device)); checkGpuErrors(cudaMemcpy(force_device, &star.force_local, sizeof star.force_local, cudaMemcpyHostToDevice)); - computeCentralForceGPUKernel<<>>( - first, last, rawPtr(d.devData.x), rawPtr(d.devData.y), rawPtr(d.devData.z), rawPtr(d.devData.ax), - rawPtr(d.devData.ay), rawPtr(d.devData.az), rawPtr(d.devData.m), star.position, star.m, d.g, - star.inner_size * star.inner_size, force_device); + computeCentralForceGPUKernel + <<>>(first, last, x, y, z, ax, ay, az, m, star.position, star.m, d.g, + star.inner_size * star.inner_size, force_device); checkGpuErrors(cudaDeviceSynchronize()); checkGpuErrors(cudaGetLastError()); @@ -91,5 +90,11 @@ void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& sta checkGpuErrors(cudaFree(force_device)); } -template void computeCentralForceGPU(size_t, size_t, sphexa::ParticlesData&, StarData&); +#define COMPUTE_CENTRAL_FORCE_GPU(Treal, Tmass) \ + template void computeCentralForceGPU(size_t, size_t, const Treal* x, const Treal* y, const Treal* z, Treal* ax, \ + Treal* ay, Treal* az, const Tmass* m, StarData&); + +COMPUTE_CENTRAL_FORCE_GPU(double, double); +COMPUTE_CENTRAL_FORCE_GPU(double, float); + } // namespace disk diff --git a/physics/Disk/include/computeCentralForce_gpu.hpp b/physics/Disk/include/computeCentralForce_gpu.hpp index bdf4b66be..481b48f29 100644 --- a/physics/Disk/include/computeCentralForce_gpu.hpp +++ b/physics/Disk/include/computeCentralForce_gpu.hpp @@ -6,6 +6,7 @@ namespace disk { -template -void computeCentralForceGPU(size_t first, size_t last, Dataset& d, StarData& star); +template +void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z, Treal* ax, + Treal* ay, Treal* az, const Tmass* m, StarData& star); } diff --git a/physics/Disk/include/get_ptr.hpp b/physics/Disk/include/get_ptr.hpp new file mode 100644 index 000000000..211495bea --- /dev/null +++ b/physics/Disk/include/get_ptr.hpp @@ -0,0 +1,14 @@ +// +// Created by Noah Kubli on 12.02.2025. +// + +#pragma once + +namespace disk +{ +template +auto* getPtr(Dataset& d) +{ + return rawPtr(get(d)); +} +} // namespace disk diff --git a/sph/include/sph/hydro_ve/momentum_energy_kern.hpp b/sph/include/sph/hydro_ve/momentum_energy_kern.hpp index 21d8ede46..879c18081 100644 --- a/sph/include/sph/hydro_ve/momentum_energy_kern.hpp +++ b/sph/include/sph/hydro_ve/momentum_energy_kern.hpp @@ -41,7 +41,7 @@ namespace sph { template -HOST_DEVICE_FUN T avRvCorrection(util::array R, Tc eta_ab, T eta_crit, const util::array& gradV_i, +HOST_DEVICE_FUN T avRvCorrection(util::array R, Tc eta_ab, T eta_crit, const util::array& gradV_i, const util::array& gradV_j) { T dmy1 = dot(R, symv(gradV_i, R));