fix devicereducesum calls inside if statements.

AMLattanzi · Jul 31, 2024 · 1e60e77 · 1e60e77
1 parent 71431cc
commit 1e60e77
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 24 deletions.
diff --git a/Source/Diffusion/PBLModels.cpp b/Source/Diffusion/PBLModels.cpp
@@ -98,12 +98,15 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
                     AMREX_ASSERT_WITH_MESSAGE(qvel(i,j,k) > 0.0, "QKE must have a positive value");
                     AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");
 
-                    if (sbx.contains(i,j,k)) {
-                        const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
-                        const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
-                        Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz, handler);
-                        Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*dz, handler);
-                    }
+                    // NOTE: This factor is to avoid an if statment that will break
+                    //       the devicereducesum since all threads won't participate.
+                    //       This more performent than Gpu::Atomic::Add.
+                    Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
+                    const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
+                    const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
+                    Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz*fac, handler);
+                    Gpu::deviceReduceSum(&qint(i,j,0,1),      qvel(i,j,k)*dz*fac, handler);
+
                 });
             } else {
                 ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
@@ -115,11 +118,14 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
                     AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");
 
                     // Not multiplying by dz: its constant and would fall out when we divide qint0/qint1 anyway
-                    if (sbx.contains(i,j,k)) {
-                        const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
-                        Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k), handler);
-                        Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k), handler);
-                    }
+
+                    // NOTE: This factor is to avoid an if statment that will break
+                    //       the devicereducesum since all threads won't participate.
+                    //       This more performent than Gpu::Atomic::Add.
+                    Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
+                    const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
+                    Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*fac, handler);
+                    Gpu::deviceReduceSum(&qint(i,j,0,1),      qvel(i,j,k)*fac, handler);
                 });
             }
 
@@ -157,7 +163,7 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
 
                 // Spatially varying MOST
                 Real surface_heat_flux = -u_star_arr(i,j,0) * t_star_arr(i,j,0);
-                Real theta0            = tm_arr(i,j,0); // TODO: IS THIS ACTUALLY RHOTHETA
+                Real theta0            = tm_arr(i,j,0);
                 Real l_obukhov;
                 if (std::abs(surface_heat_flux) > eps) {
                     l_obukhov = ( theta0 * u_star_arr(i,j,0) * u_star_arr(i,j,0) ) /

diff --git a/Source/SourceTerms/ERF_make_mom_sources.cpp b/Source/SourceTerms/ERF_make_mom_sources.cpp
@@ -151,8 +151,8 @@ void make_mom_sources (int /*level*/,
         Gpu::HostVector<    Real> u_plane_h(u_ncell), v_plane_h(v_ncell);
         Gpu::DeviceVector<  Real> u_plane_d(u_ncell), v_plane_d(v_ncell);
 
-        u_ave.line_average(0 , u_plane_h);
-        v_ave.line_average(0 , v_plane_h);
+        u_ave.line_average(0, u_plane_h);
+        v_ave.line_average(0, v_plane_h);
 
         Gpu::copyAsync(Gpu::hostToDevice, u_plane_h.begin(), u_plane_h.end(), u_plane_d.begin());
         Gpu::copyAsync(Gpu::hostToDevice, v_plane_h.begin(), v_plane_h.end(), v_plane_d.begin());

diff --git a/Source/Utils/DirectionSelector.H b/Source/Utils/DirectionSelector.H
@@ -63,7 +63,7 @@ PerpendicularBox (const amrex::Box& bx, const amrex::IntVect& iv)
         plane_hi = {bx.bigEnd(0), bx.bigEnd(1), iv[2]};
     }
 
-    amrex::Box pbx(plane_lo, plane_hi);
+    amrex::Box pbx(plane_lo, plane_hi, bx.type());
 
     return pbx;
 }
@@ -95,7 +95,7 @@ ParallelBox (const amrex::Box& bx, const amrex::IntVect& iv)
         line_hi = {iv[0], iv[1], bx.bigEnd(2)};
     }
 
-    amrex::Box lbx(line_lo, line_hi);
+    amrex::Box lbx(line_lo, line_hi, bx.type());
 
     return lbx;
 }

diff --git a/Source/Utils/PlaneAverage.H b/Source/Utils/PlaneAverage.H
@@ -111,8 +111,10 @@ PlaneAverage::PlaneAverage (const amrex::MultiFab* field_in,
     m_ncell_line = dom_hi[m_axis] - dom_lo[m_axis] + 1 + m_ixtype[m_axis];
 
     m_ncell_plane = 1;
+    auto period = m_geom.periodicity();
     for (int i = 0; i < AMREX_SPACEDIM; ++i) {
-       if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + m_ixtype[m_axis]);
+      int p_fac = (!period.isPeriodic(i)) ? 1 : 0;
+       if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + p_fac*m_ixtype[i]);
     }
 
     m_line_average.resize(static_cast<size_t>(m_ncell_line) * m_ncomp, 0.0);
@@ -202,9 +204,9 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
         const amrex::Array4<const amrex::Real>& fab_arr  = mfab.const_array(mfi);
         const amrex::Array4<const int        >& mask_arr = mask->const_array(mfi);
 
-        ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
-                    AMREX_GPU_DEVICE( int p_i, int p_j, int p_k,
-                                      amrex::Gpu::Handler const& handler) noexcept
+        amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
+                           AMREX_GPU_DEVICE( int p_i, int p_j, int p_k,
+                                             amrex::Gpu::Handler const& handler) noexcept
         {
             // Loop over the direction perpendicular to the plane.
             // This reduces the atomic pressure on the destination arrays.
@@ -215,10 +217,12 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
                     for (int i = lbx.smallEnd(0); i <= lbx.bigEnd(0); ++i) {
                         int ind = idxOp.getIndx(i, j, k) + offset;
                         for (int n = 0; n < ncomp; ++n) {
-                            if (mask_arr(i,j,k)) {
-                                amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
-                                                            fab_arr(i, j, k, n) * denom, handler);
-                            }
+                            // NOTE: This factor is to avoid an if statment that will break
+                            //       the devicereducesum since all threads won't participate.
+                            //       This more performent than Gpu::Atomic::Add.
+                            amrex::Real fac = (mask_arr(i,j,k)) ? 1.0 : 0.0;
+                            amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
+                                                        fab_arr(i, j, k, n) * denom * fac, handler);
                         }
                     }
                 }