use static device variable instead of cudaMalloc in computeCentralFor…

…ceGPU
sphexa-org · Feb 12, 2025 · 3708232 · 3708232
1 parent b1789ec
commit 3708232
Showing 1 changed file with 14 additions and 14 deletions.
diff --git a/physics/Disk/include/computeCentralForce_gpu.cu b/physics/Disk/include/computeCentralForce_gpu.cu
@@ -18,6 +18,8 @@
 namespace disk
 {
 
+static __device__ cstone::Vec4<double> force_device;
+
 template<typename T>
 __device__ void atomicAddVec4(cstone::Vec4<T>* x, const cstone::Vec4<T>& y)
 {
@@ -30,11 +32,11 @@ __device__ void atomicAddVec4(cstone::Vec4<T>* x, const cstone::Vec4<T>& y)
 template<size_t numThreads, typename Treal, typename Tmass, typename Ta, typename Tstar, typename Tforce>
 __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Treal* x, const Treal* y, const Treal* z,
                                              Ta* ax, Ta* ay, Ta* az, const Tmass* m,
-                                             const cstone::Vec3<Tstar> star_position, Tstar m_star, Treal g, Tstar inner_size2,
-                                             Tforce* force_device)
+                                             const cstone::Vec3<Tstar> star_position, Tstar m_star, Treal g,
+                                             Tstar inner_size2)
 {
     cstone::LocalIndex i = first + blockDim.x * blockIdx.x + threadIdx.x;
-    Tforce force{};
+    Tforce             force{};
 
     if (i >= last) { force = {0., 0., 0., 0.}; }
     else
@@ -60,12 +62,12 @@ __global__ void computeCentralForceGPUKernel(size_t first, size_t last, const Tr
         force[3] = -az_i * m[i];
     }
 
-    typedef cub::BlockReduce<Tforce, numThreads>     BlockReduce;
+    typedef cub::BlockReduce<Tforce, numThreads> BlockReduce;
     __shared__ typename BlockReduce::TempStorage temp_storage;
 
     Tforce force_block = BlockReduce(temp_storage).Sum(force);
     __syncthreads();
-    if (threadIdx.x == 0) { atomicAddVec4(force_device, force_block); }
+    if (threadIdx.x == 0) { atomicAddVec4(&force_device, force_block); }
 }
 
 template<typename Treal, typename Thydro, typename Tmass>
@@ -76,23 +78,21 @@ void computeCentralForceGPU(size_t first, size_t last, const Treal* x, const Tre
     constexpr unsigned numThreads   = 256;
     unsigned           numBlocks    = (numParticles + numThreads - 1) / numThreads;
 
-    star.force_local = {};
-    cstone::Vec4<double>* force_device;
-    checkGpuErrors(cudaMalloc(reinterpret_cast<void**>(&force_device), sizeof *force_device));
-    checkGpuErrors(cudaMemcpy(force_device, &star.force_local, sizeof star.force_local, cudaMemcpyHostToDevice));
+    cstone::Vec4<double> force_local{0., 0., 0., 0.};
+    checkGpuErrors(cudaMemcpyToSymbol(GPU_SYMBOL(force_device), &force_local, sizeof(force_local)));
 
     computeCentralForceGPUKernel<numThreads><<<numBlocks, numThreads>>>(
-        first, last, x, y, z, ax, ay, az, m, star.position, star.m, g, star.inner_size * star.inner_size, force_device);
+        first, last, x, y, z, ax, ay, az, m, star.position, star.m, g, star.inner_size * star.inner_size);
 
     checkGpuErrors(cudaDeviceSynchronize());
     checkGpuErrors(cudaGetLastError());
 
-    checkGpuErrors(cudaMemcpy(&star.force_local, force_device, sizeof star.force_local, cudaMemcpyDeviceToHost));
-    checkGpuErrors(cudaFree(force_device));
+    checkGpuErrors(cudaMemcpyFromSymbol(&force_local, GPU_SYMBOL(force_device), sizeof(force_local)));
+    star.force_local = force_local;
 }
 
-#define COMPUTE_CENTRAL_FORCE_GPU(Treal, Thydro, Tmass)                                                                        \
-    template void computeCentralForceGPU(size_t, size_t, const Treal* x, const Treal* y, const Treal* z, Thydro* ax,    \
+#define COMPUTE_CENTRAL_FORCE_GPU(Treal, Thydro, Tmass)                                                                \
+    template void computeCentralForceGPU(size_t, size_t, const Treal* x, const Treal* y, const Treal* z, Thydro* ax,   \
                                          Thydro* ay, Thydro* az, const Tmass* m, const Treal g, StarData&);
 
 COMPUTE_CENTRAL_FORCE_GPU(double, double, double);