Skip to content

Commit

Permalink
PlaneAverage GPU (erf-model#1717)
Browse files Browse the repository at this point in the history
* fix devicereducesum calls inside if statements.

* Different approach to avoid touching halo cells inside the domain.

* Codespell fix.

* Bomex inputs fix.

---------

Co-authored-by: Aaron Lattanzi <[email protected]>
  • Loading branch information
AMLattanzi and Aaron Lattanzi authored Aug 1, 2024
1 parent 7b812de commit fd1673c
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 26 deletions.
6 changes: 5 additions & 1 deletion Exec/RegTests/Bomex/input_Kessler
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,11 @@ erf.plot_vars_1 = density rhotheta x_velocity y_velocity z_velocity pressure
erf.alpha_T = 0.0
erf.alpha_C = 0.0
erf.use_gravity = true


erf.use_coriolis = true
erf.coriolis_3d = false
erf.latitude = 14.982176712702886 # f = 0.376e-4 1/s

erf.dycore_horiz_adv_type = Upwind_3rd
erf.dycore_vert_adv_type = Upwind_3rd
erf.dryscal_horiz_adv_type = Upwind_3rd
Expand Down
4 changes: 4 additions & 0 deletions Exec/RegTests/Bomex/input_SAM
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ erf.alpha_T = 0.0
erf.alpha_C = 0.0
erf.use_gravity = true

erf.use_coriolis = true
erf.coriolis_3d = false
erf.latitude = 14.982176712702886 # f = 0.376e-4 1/s

erf.dycore_horiz_adv_type = Upwind_3rd
erf.dycore_vert_adv_type = Upwind_3rd
erf.dryscal_horiz_adv_type = Upwind_3rd
Expand Down
30 changes: 18 additions & 12 deletions Source/Diffusion/PBLModels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,15 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
AMREX_ASSERT_WITH_MESSAGE(qvel(i,j,k) > 0.0, "QKE must have a positive value");
AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");

if (sbx.contains(i,j,k)) {
const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz, handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*dz, handler);
}
// NOTE: This factor is to avoid an if statement that will break
// the devicereducesum since all threads won't participate.
// This more performant than Gpu::Atomic::Add.
Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz*fac, handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*dz*fac, handler);

});
} else {
ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
Expand All @@ -115,11 +118,14 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");

// Not multiplying by dz: its constant and would fall out when we divide qint0/qint1 anyway
if (sbx.contains(i,j,k)) {
const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k), handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k), handler);
}

// NOTE: This factor is to avoid an if statement that will break
// the devicereducesum since all threads won't participate.
// This more performant than Gpu::Atomic::Add.
Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*fac, handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*fac, handler);
});
}

Expand Down Expand Up @@ -157,7 +163,7 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,

// Spatially varying MOST
Real surface_heat_flux = -u_star_arr(i,j,0) * t_star_arr(i,j,0);
Real theta0 = tm_arr(i,j,0); // TODO: IS THIS ACTUALLY RHOTHETA
Real theta0 = tm_arr(i,j,0);
Real l_obukhov;
if (std::abs(surface_heat_flux) > eps) {
l_obukhov = ( theta0 * u_star_arr(i,j,0) * u_star_arr(i,j,0) ) /
Expand Down
4 changes: 2 additions & 2 deletions Source/SourceTerms/ERF_make_mom_sources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ void make_mom_sources (int /*level*/,
Gpu::HostVector< Real> u_plane_h(u_ncell), v_plane_h(v_ncell);
Gpu::DeviceVector< Real> u_plane_d(u_ncell), v_plane_d(v_ncell);

u_ave.line_average(0 , u_plane_h);
v_ave.line_average(0 , v_plane_h);
u_ave.line_average(0, u_plane_h);
v_ave.line_average(0, v_plane_h);

Gpu::copyAsync(Gpu::hostToDevice, u_plane_h.begin(), u_plane_h.end(), u_plane_d.begin());
Gpu::copyAsync(Gpu::hostToDevice, v_plane_h.begin(), v_plane_h.end(), v_plane_d.begin());
Expand Down
4 changes: 2 additions & 2 deletions Source/Utils/DirectionSelector.H
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ PerpendicularBox (const amrex::Box& bx, const amrex::IntVect& iv)
plane_hi = {bx.bigEnd(0), bx.bigEnd(1), iv[2]};
}

amrex::Box pbx(plane_lo, plane_hi);
amrex::Box pbx(plane_lo, plane_hi, bx.type());

return pbx;
}
Expand Down Expand Up @@ -95,7 +95,7 @@ ParallelBox (const amrex::Box& bx, const amrex::IntVect& iv)
line_hi = {iv[0], iv[1], bx.bigEnd(2)};
}

amrex::Box lbx(line_lo, line_hi);
amrex::Box lbx(line_lo, line_hi, bx.type());

return lbx;
}
Expand Down
26 changes: 17 additions & 9 deletions Source/Utils/PlaneAverage.H
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,10 @@ PlaneAverage::PlaneAverage (const amrex::MultiFab* field_in,
m_ncell_line = dom_hi[m_axis] - dom_lo[m_axis] + 1 + m_ixtype[m_axis];

m_ncell_plane = 1;
auto period = m_geom.periodicity();
for (int i = 0; i < AMREX_SPACEDIM; ++i) {
if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + m_ixtype[m_axis]);
int p_fac = (!period.isPeriodic(i)) ? 1 : 0;
if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + p_fac*m_ixtype[i]);
}

m_line_average.resize(static_cast<size_t>(m_ncell_line) * m_ncomp, 0.0);
Expand Down Expand Up @@ -186,6 +188,8 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
amrex::Real* line_avg = lavg.data();
const int ncomp = m_ncomp;

amrex::Box domain = amrex::convert(m_geom.Domain(),m_ixtype);

amrex::IntVect ng = amrex::IntVect(0);
int offset = m_ng[m_axis];
if (m_inc_ghost) ng[m_axis] = offset;
Expand All @@ -196,29 +200,33 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
#endif
for (amrex::MFIter mfi(mfab, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
amrex::Box bx = mfi.tilebox(m_ixtype, ng);
amrex::Box pbx = PerpendicularBox<IndexSelector>(bx, amrex::IntVect(0));
amrex::Box tbx = mfi.tilebox();
if (tbx.smallEnd(m_axis) == domain.smallEnd(m_axis)) tbx.growLo(m_axis,offset);
if (tbx.bigEnd (m_axis) == domain.bigEnd (m_axis)) tbx.growHi(m_axis,offset);
amrex::Box pbx = PerpendicularBox<IndexSelector>(tbx, amrex::IntVect(0));

const amrex::Array4<const amrex::Real>& fab_arr = mfab.const_array(mfi);
const amrex::Array4<const int >& mask_arr = mask->const_array(mfi);

ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
AMREX_GPU_DEVICE( int p_i, int p_j, int p_k,
amrex::Gpu::Handler const& handler) noexcept
{
// Loop over the direction perpendicular to the plane.
// This reduces the atomic pressure on the destination arrays.
amrex::Box lbx = ParallelBox<IndexSelector>(bx, amrex::IntVect{p_i, p_j, p_k});
amrex::Box lbx = ParallelBox<IndexSelector>(tbx, amrex::IntVect{p_i, p_j, p_k});

for (int k = lbx.smallEnd(2); k <= lbx.bigEnd(2); ++k) {
for (int j = lbx.smallEnd(1); j <= lbx.bigEnd(1); ++j) {
for (int i = lbx.smallEnd(0); i <= lbx.bigEnd(0); ++i) {
int ind = idxOp.getIndx(i, j, k) + offset;
for (int n = 0; n < ncomp; ++n) {
if (mask_arr(i,j,k)) {
amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
fab_arr(i, j, k, n) * denom, handler);
}
// NOTE: This factor is to avoid an if statement that will break
// the devicereducesum since all threads won't participate.
// This more performant than Gpu::Atomic::Add.
amrex::Real fac = (mask_arr(i,j,k)) ? 1.0 : 0.0;
amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
fab_arr(i, j, k, n) * denom * fac, handler);
}
}
}
Expand Down

0 comments on commit fd1673c

Please sign in to comment.