PlaneAverage GPU (erf-model#1717)

* fix devicereducesum calls inside if statements. * Different approach to avoid touching halo cells inside the domain. * Codespell fix. * Bomex inputs fix. --------- Co-authored-by: Aaron Lattanzi <[email protected]>
AMLattanzi · Aug 1, 2024 · fd1673c · fd1673c
1 parent 7b812de
commit fd1673c
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 26 deletions.
diff --git a/Exec/RegTests/Bomex/input_Kessler b/Exec/RegTests/Bomex/input_Kessler
@@ -55,7 +55,11 @@ erf.plot_vars_1     = density rhotheta x_velocity y_velocity z_velocity pressure
 erf.alpha_T = 0.0
 erf.alpha_C = 0.0
 erf.use_gravity = true
-
+
+erf.use_coriolis    = true
+erf.coriolis_3d     = false
+erf.latitude        = 14.982176712702886  # f = 0.376e-4 1/s
+
 erf.dycore_horiz_adv_type    = Upwind_3rd
 erf.dycore_vert_adv_type     = Upwind_3rd
 erf.dryscal_horiz_adv_type   = Upwind_3rd

diff --git a/Exec/RegTests/Bomex/input_SAM b/Exec/RegTests/Bomex/input_SAM
@@ -56,6 +56,10 @@ erf.alpha_T = 0.0
 erf.alpha_C = 0.0
 erf.use_gravity = true
 
+erf.use_coriolis    = true
+erf.coriolis_3d     = false
+erf.latitude        = 14.982176712702886  # f = 0.376e-4 1/s
+
 erf.dycore_horiz_adv_type    = Upwind_3rd
 erf.dycore_vert_adv_type     = Upwind_3rd
 erf.dryscal_horiz_adv_type   = Upwind_3rd

diff --git a/Source/Diffusion/PBLModels.cpp b/Source/Diffusion/PBLModels.cpp
@@ -98,12 +98,15 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
                     AMREX_ASSERT_WITH_MESSAGE(qvel(i,j,k) > 0.0, "QKE must have a positive value");
                     AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");
 
-                    if (sbx.contains(i,j,k)) {
-                        const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
-                        const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
-                        Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz, handler);
-                        Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*dz, handler);
-                    }
+                    // NOTE: This factor is to avoid an if statement that will break
+                    //       the devicereducesum since all threads won't participate.
+                    //       This more performant than Gpu::Atomic::Add.
+                    Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
+                    const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
+                    const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
+                    Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz*fac, handler);
+                    Gpu::deviceReduceSum(&qint(i,j,0,1),      qvel(i,j,k)*dz*fac, handler);
+
                 });
             } else {
                 ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
@@ -115,11 +118,14 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
                     AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");
 
                     // Not multiplying by dz: its constant and would fall out when we divide qint0/qint1 anyway
-                    if (sbx.contains(i,j,k)) {
-                        const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
-                        Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k), handler);
-                        Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k), handler);
-                    }
+
+                    // NOTE: This factor is to avoid an if statement that will break
+                    //       the devicereducesum since all threads won't participate.
+                    //       This more performant than Gpu::Atomic::Add.
+                    Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
+                    const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
+                    Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*fac, handler);
+                    Gpu::deviceReduceSum(&qint(i,j,0,1),      qvel(i,j,k)*fac, handler);
                 });
             }
 
@@ -157,7 +163,7 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
 
                 // Spatially varying MOST
                 Real surface_heat_flux = -u_star_arr(i,j,0) * t_star_arr(i,j,0);
-                Real theta0            = tm_arr(i,j,0); // TODO: IS THIS ACTUALLY RHOTHETA
+                Real theta0            = tm_arr(i,j,0);
                 Real l_obukhov;
                 if (std::abs(surface_heat_flux) > eps) {
                     l_obukhov = ( theta0 * u_star_arr(i,j,0) * u_star_arr(i,j,0) ) /

diff --git a/Source/SourceTerms/ERF_make_mom_sources.cpp b/Source/SourceTerms/ERF_make_mom_sources.cpp
@@ -151,8 +151,8 @@ void make_mom_sources (int /*level*/,
         Gpu::HostVector<    Real> u_plane_h(u_ncell), v_plane_h(v_ncell);
         Gpu::DeviceVector<  Real> u_plane_d(u_ncell), v_plane_d(v_ncell);
 
-        u_ave.line_average(0 , u_plane_h);
-        v_ave.line_average(0 , v_plane_h);
+        u_ave.line_average(0, u_plane_h);
+        v_ave.line_average(0, v_plane_h);
 
         Gpu::copyAsync(Gpu::hostToDevice, u_plane_h.begin(), u_plane_h.end(), u_plane_d.begin());
         Gpu::copyAsync(Gpu::hostToDevice, v_plane_h.begin(), v_plane_h.end(), v_plane_d.begin());

diff --git a/Source/Utils/DirectionSelector.H b/Source/Utils/DirectionSelector.H
@@ -63,7 +63,7 @@ PerpendicularBox (const amrex::Box& bx, const amrex::IntVect& iv)
         plane_hi = {bx.bigEnd(0), bx.bigEnd(1), iv[2]};
     }
 
-    amrex::Box pbx(plane_lo, plane_hi);
+    amrex::Box pbx(plane_lo, plane_hi, bx.type());
 
     return pbx;
 }
@@ -95,7 +95,7 @@ ParallelBox (const amrex::Box& bx, const amrex::IntVect& iv)
         line_hi = {iv[0], iv[1], bx.bigEnd(2)};
     }
 
-    amrex::Box lbx(line_lo, line_hi);
+    amrex::Box lbx(line_lo, line_hi, bx.type());
 
     return lbx;
 }

diff --git a/Source/Utils/PlaneAverage.H b/Source/Utils/PlaneAverage.H
@@ -111,8 +111,10 @@ PlaneAverage::PlaneAverage (const amrex::MultiFab* field_in,
     m_ncell_line = dom_hi[m_axis] - dom_lo[m_axis] + 1 + m_ixtype[m_axis];
 
     m_ncell_plane = 1;
+    auto period = m_geom.periodicity();
     for (int i = 0; i < AMREX_SPACEDIM; ++i) {
-       if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + m_ixtype[m_axis]);
+      int p_fac = (!period.isPeriodic(i)) ? 1 : 0;
+       if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + p_fac*m_ixtype[i]);
     }
 
     m_line_average.resize(static_cast<size_t>(m_ncell_line) * m_ncomp, 0.0);
@@ -186,6 +188,8 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
     amrex::Real* line_avg = lavg.data();
     const int ncomp = m_ncomp;
 
+    amrex::Box domain = amrex::convert(m_geom.Domain(),m_ixtype);
+
     amrex::IntVect ng = amrex::IntVect(0);
     int offset = m_ng[m_axis];
     if (m_inc_ghost) ng[m_axis] = offset;
@@ -196,29 +200,33 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
 #endif
     for (amrex::MFIter mfi(mfab, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
-        amrex::Box bx  = mfi.tilebox(m_ixtype, ng);
-        amrex::Box pbx = PerpendicularBox<IndexSelector>(bx, amrex::IntVect(0));
+        amrex::Box tbx  = mfi.tilebox();
+        if (tbx.smallEnd(m_axis) == domain.smallEnd(m_axis)) tbx.growLo(m_axis,offset);
+        if (tbx.bigEnd  (m_axis) == domain.bigEnd  (m_axis)) tbx.growHi(m_axis,offset);
+        amrex::Box pbx = PerpendicularBox<IndexSelector>(tbx, amrex::IntVect(0));
 
         const amrex::Array4<const amrex::Real>& fab_arr  = mfab.const_array(mfi);
         const amrex::Array4<const int        >& mask_arr = mask->const_array(mfi);
 
-        ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
+        amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
                     AMREX_GPU_DEVICE( int p_i, int p_j, int p_k,
                                       amrex::Gpu::Handler const& handler) noexcept
         {
             // Loop over the direction perpendicular to the plane.
             // This reduces the atomic pressure on the destination arrays.
-            amrex::Box lbx = ParallelBox<IndexSelector>(bx, amrex::IntVect{p_i, p_j, p_k});
+            amrex::Box lbx = ParallelBox<IndexSelector>(tbx, amrex::IntVect{p_i, p_j, p_k});
 
             for (int k = lbx.smallEnd(2); k <= lbx.bigEnd(2); ++k) {
                 for (int j = lbx.smallEnd(1); j <= lbx.bigEnd(1); ++j) {
                     for (int i = lbx.smallEnd(0); i <= lbx.bigEnd(0); ++i) {
                         int ind = idxOp.getIndx(i, j, k) + offset;
                         for (int n = 0; n < ncomp; ++n) {
-                            if (mask_arr(i,j,k)) {
-                                amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
-                                                            fab_arr(i, j, k, n) * denom, handler);
-                            }
+                            // NOTE: This factor is to avoid an if statement that will break
+                            //       the devicereducesum since all threads won't participate.
+                            //       This more performant than Gpu::Atomic::Add.
+                            amrex::Real fac = (mask_arr(i,j,k)) ? 1.0 : 0.0;
+                            amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
+                                                            fab_arr(i, j, k, n) * denom * fac, handler);
                         }
                     }
                 }