Skip to content

Commit

Permalink
fix devicereducesum calls inside if statements.
Browse files Browse the repository at this point in the history
  • Loading branch information
Aaron Lattanzi committed Jul 31, 2024
1 parent 71431cc commit 1e60e77
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 24 deletions.
30 changes: 18 additions & 12 deletions Source/Diffusion/PBLModels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,15 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
AMREX_ASSERT_WITH_MESSAGE(qvel(i,j,k) > 0.0, "QKE must have a positive value");
AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");

if (sbx.contains(i,j,k)) {
const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz, handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*dz, handler);
}
// NOTE: This factor is to avoid an if statment that will break
// the devicereducesum since all threads won't participate.
// This more performent than Gpu::Atomic::Add.
Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
const Real Zval = Compute_Zrel_AtCellCenter(i,j,k,z_nd_arr);
const Real dz = Compute_h_zeta_AtCellCenter(i,j,k,invCellSize,z_nd_arr);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*dz*fac, handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*dz*fac, handler);

});
} else {
ParallelFor(Gpu::KernelInfo().setReduction(true), bx,
Expand All @@ -115,11 +118,14 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,
AMREX_ASSERT_WITH_MESSAGE(qvel_old(i,j,k) > 0.0, "Old QKE must have a positive value");

// Not multiplying by dz: its constant and would fall out when we divide qint0/qint1 anyway
if (sbx.contains(i,j,k)) {
const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k), handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k), handler);
}

// NOTE: This factor is to avoid an if statment that will break
// the devicereducesum since all threads won't participate.
// This more performent than Gpu::Atomic::Add.
Real fac = (sbx.contains(i,j,k)) ? 1.0 : 0.0;
const Real Zval = gdata.ProbLo(2) + (k + 0.5)*gdata.CellSize(2);
Gpu::deviceReduceSum(&qint(i,j,0,0), Zval*qvel(i,j,k)*fac, handler);
Gpu::deviceReduceSum(&qint(i,j,0,1), qvel(i,j,k)*fac, handler);
});
}

Expand Down Expand Up @@ -157,7 +163,7 @@ ComputeTurbulentViscosityPBL (const MultiFab& xvel,

// Spatially varying MOST
Real surface_heat_flux = -u_star_arr(i,j,0) * t_star_arr(i,j,0);
Real theta0 = tm_arr(i,j,0); // TODO: IS THIS ACTUALLY RHOTHETA
Real theta0 = tm_arr(i,j,0);
Real l_obukhov;
if (std::abs(surface_heat_flux) > eps) {
l_obukhov = ( theta0 * u_star_arr(i,j,0) * u_star_arr(i,j,0) ) /
Expand Down
4 changes: 2 additions & 2 deletions Source/SourceTerms/ERF_make_mom_sources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ void make_mom_sources (int /*level*/,
Gpu::HostVector< Real> u_plane_h(u_ncell), v_plane_h(v_ncell);
Gpu::DeviceVector< Real> u_plane_d(u_ncell), v_plane_d(v_ncell);

u_ave.line_average(0 , u_plane_h);
v_ave.line_average(0 , v_plane_h);
u_ave.line_average(0, u_plane_h);
v_ave.line_average(0, v_plane_h);

Gpu::copyAsync(Gpu::hostToDevice, u_plane_h.begin(), u_plane_h.end(), u_plane_d.begin());
Gpu::copyAsync(Gpu::hostToDevice, v_plane_h.begin(), v_plane_h.end(), v_plane_d.begin());
Expand Down
4 changes: 2 additions & 2 deletions Source/Utils/DirectionSelector.H
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ PerpendicularBox (const amrex::Box& bx, const amrex::IntVect& iv)
plane_hi = {bx.bigEnd(0), bx.bigEnd(1), iv[2]};
}

amrex::Box pbx(plane_lo, plane_hi);
amrex::Box pbx(plane_lo, plane_hi, bx.type());

return pbx;
}
Expand Down Expand Up @@ -95,7 +95,7 @@ ParallelBox (const amrex::Box& bx, const amrex::IntVect& iv)
line_hi = {iv[0], iv[1], bx.bigEnd(2)};
}

amrex::Box lbx(line_lo, line_hi);
amrex::Box lbx(line_lo, line_hi, bx.type());

return lbx;
}
Expand Down
20 changes: 12 additions & 8 deletions Source/Utils/PlaneAverage.H
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,10 @@ PlaneAverage::PlaneAverage (const amrex::MultiFab* field_in,
m_ncell_line = dom_hi[m_axis] - dom_lo[m_axis] + 1 + m_ixtype[m_axis];

m_ncell_plane = 1;
auto period = m_geom.periodicity();
for (int i = 0; i < AMREX_SPACEDIM; ++i) {
if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + m_ixtype[m_axis]);
int p_fac = (!period.isPeriodic(i)) ? 1 : 0;
if (i != m_axis) m_ncell_plane *= (dom_hi[i] - dom_lo[i] + 1 + p_fac*m_ixtype[i]);
}

m_line_average.resize(static_cast<size_t>(m_ncell_line) * m_ncomp, 0.0);
Expand Down Expand Up @@ -202,9 +204,9 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
const amrex::Array4<const amrex::Real>& fab_arr = mfab.const_array(mfi);
const amrex::Array4<const int >& mask_arr = mask->const_array(mfi);

ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
AMREX_GPU_DEVICE( int p_i, int p_j, int p_k,
amrex::Gpu::Handler const& handler) noexcept
amrex::ParallelFor(amrex::Gpu::KernelInfo().setReduction(true), pbx, [=]
AMREX_GPU_DEVICE( int p_i, int p_j, int p_k,
amrex::Gpu::Handler const& handler) noexcept
{
// Loop over the direction perpendicular to the plane.
// This reduces the atomic pressure on the destination arrays.
Expand All @@ -215,10 +217,12 @@ PlaneAverage::compute_averages (const IndexSelector& idxOp, const amrex::MultiFa
for (int i = lbx.smallEnd(0); i <= lbx.bigEnd(0); ++i) {
int ind = idxOp.getIndx(i, j, k) + offset;
for (int n = 0; n < ncomp; ++n) {
if (mask_arr(i,j,k)) {
amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
fab_arr(i, j, k, n) * denom, handler);
}
// NOTE: This factor is to avoid an if statment that will break
// the devicereducesum since all threads won't participate.
// This more performent than Gpu::Atomic::Add.
amrex::Real fac = (mask_arr(i,j,k)) ? 1.0 : 0.0;
amrex::Gpu::deviceReduceSum(&line_avg[ncomp * ind + n],
fab_arr(i, j, k, n) * denom * fac, handler);
}
}
}
Expand Down

0 comments on commit 1e60e77

Please sign in to comment.