diff --git a/Docs/sphinx_doc/TimeAdvance.rst b/Docs/sphinx_doc/TimeAdvance.rst index e6d12ef33..872c41409 100644 --- a/Docs/sphinx_doc/TimeAdvance.rst +++ b/Docs/sphinx_doc/TimeAdvance.rst @@ -135,8 +135,14 @@ Then the acoustic substepping evolves the equations in the form - \frac{\partial (\beta_1 W^{\prime \prime, \tau} + \beta_2 W^{\prime \prime, \tau + \delta \tau})}{\partial z} + R^t_{\rho} \right) -where :math:`\beta_1 = 0.5 (1 - \beta_s)` and :math:`\beta_2 = 0.5 (1 + \beta_s)` with :math:`\beta_s = 0.1`. -:math:`\beta_s` is the acoustic step off-centering coefficient and 0.1 is the typical WRF value. This off-centering is intended to provide damping of both horizontally and vertically propagating sound waves by biasing the time average toward the future time step. +where :math:`\beta_1 = 0.5 (1 - \beta_s)` and :math:`\beta_2 = 0.5 (1 + \beta_s)`. + +:math:`\beta_s` is the acoustic step off-centering coefficient. When we do implicit substepping, we use +the typical WRF value of 0.1. This off-centering is intended to provide damping of both horizontally +and vertically propagating sound waves by biasing the time average toward the future time step. + +When we do fully explicit substepping, we set :math:`\beta_s = -1.0`, which gives +:math:`\beta_1 = 1` and :math:`\beta_2 = 0`. To solve the coupled system, we first evolve the equations for :math:`U^{\prime \prime, \tau + \delta \tau}` and :math:`V^{\prime \prime, \tau + \delta \tau}` explicitly using :math:`\Theta^{\prime \prime, \tau}` which is already known. @@ -149,10 +155,10 @@ to control horizontally propagating sound waves. .. math:: p^{\prime\prime,\tau*} = p^{\prime\prime,\tau} - + \beta_d \left( p^{\prime\prime,\tau} + p^{\prime\prime,\tau-\delta\tau} \right) + + \beta_d \left( p^{\prime\prime,\tau} - p^{\prime\prime,\tau-\delta\tau} \right) where :math:`\tau*` is the forward projected value used in RHS of the acoustic substepping equations for horizontal momentum. According to Skamarock et al, -This is equivalent to including a horizontal diffusion term in the continuity +this is equivalent to including a horizontal diffusion term in the continuity equation. A typical damping coefficient of :math:`\beta_d = 0.1` is used, as in WRF.
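Note for reviewers: as a quick sanity check on the off-centering weights documented above, the two cases wired up in this patch can be verified with a standalone C++ snippet (plain doubles in place of amrex::Real; this is an illustration, not ERF code):

    #include <iostream>

    int main () {
        // Off-centering weights for the acoustic-substep time average.
        for (double beta_s : {0.1, -1.0}) { // implicit (typical WRF value) vs fully explicit
            double beta_1 = 0.5 * (1.0 - beta_s); // weight on the known state at tau
            double beta_2 = 0.5 * (1.0 + beta_s); // weight on the new state at tau + dtau
            std::cout << "beta_s = " << beta_s
                      << " -> beta_1 = " << beta_1
                      << ", beta_2 = " << beta_2 << "\n";
        }
        // beta_s =  0.1 -> beta_1 = 0.45, beta_2 = 0.55 (biased toward the future substep)
        // beta_s = -1.0 -> beta_1 = 1.00, beta_2 = 0.00 (fully explicit)
        return 0;
    }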
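The divergence-damping projection is similarly a one-liner; in ERF_fast_rhs_N.cpp below, the same forward projection is applied to the (rho theta) perturbation (theta_extrap, with lagged_delta_rt supplying the value at tau - delta tau). A minimal sketch with hypothetical values:

    #include <iostream>

    int main () {
        const double beta_d = 0.1;    // damping coefficient, as in WRF
        const double p_tau  = 2.0e-3; // hypothetical p'' at time tau
        const double p_taum = 1.5e-3; // hypothetical p'' at time tau - delta tau

        // p''(tau*) = p''(tau) + beta_d * ( p''(tau) - p''(tau - delta tau) )
        const double p_star = p_tau + beta_d * (p_tau - p_taum);
        std::cout << "p_star = " << p_star << "\n"; // prints 0.00205
        return 0;
    }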
diff --git a/Exec/CMakeLists.txt b/Exec/CMakeLists.txt index fd00cc297..a5b384ebd 100644 --- a/Exec/CMakeLists.txt +++ b/Exec/CMakeLists.txt @@ -20,7 +20,6 @@ elseif (ERF_ENABLE_REGRESSION_TESTS_ONLY) else () add_subdirectory(ABL) add_subdirectory(SuperCell) - add_subdirectory(Radiation) add_subdirectory(SquallLine_2D) add_subdirectory(RegTests/Bubble) add_subdirectory(RegTests/Couette_Poiseuille) @@ -35,9 +34,10 @@ else () add_subdirectory(RegTests/WPS_Test) add_subdirectory(RegTests/Bomex) add_subdirectory(RegTests/TurbulentInflow) - add_subdirectory(DevTests/MovingTerrain) - add_subdirectory(DevTests/MetGrid) add_subdirectory(DevTests/LandSurfaceModel) + add_subdirectory(DevTests/MetGrid) + add_subdirectory(DevTests/MovingTerrain) + add_subdirectory(DevTests/Radiation) add_subdirectory(DevTests/TemperatureSource) add_subdirectory(DevTests/TropicalCyclone) endif() diff --git a/Exec/Radiation/CMakeLists.txt b/Exec/DevTests/Radiation/CMakeLists.txt similarity index 100% rename from Exec/Radiation/CMakeLists.txt rename to Exec/DevTests/Radiation/CMakeLists.txt diff --git a/Exec/Radiation/ERF_prob.H b/Exec/DevTests/Radiation/ERF_prob.H similarity index 100% rename from Exec/Radiation/ERF_prob.H rename to Exec/DevTests/Radiation/ERF_prob.H diff --git a/Exec/Radiation/ERF_prob.cpp b/Exec/DevTests/Radiation/ERF_prob.cpp similarity index 100% rename from Exec/Radiation/ERF_prob.cpp rename to Exec/DevTests/Radiation/ERF_prob.cpp diff --git a/Exec/Radiation/GNUmakefile b/Exec/DevTests/Radiation/GNUmakefile similarity index 100% rename from Exec/Radiation/GNUmakefile rename to Exec/DevTests/Radiation/GNUmakefile diff --git a/Exec/Radiation/Make.package b/Exec/DevTests/Radiation/Make.package similarity index 100% rename from Exec/Radiation/Make.package rename to Exec/DevTests/Radiation/Make.package diff --git a/Exec/Radiation/README b/Exec/DevTests/Radiation/README similarity index 100% rename from Exec/Radiation/README rename to Exec/DevTests/Radiation/README diff --git a/Exec/Radiation/inputs_radiation b/Exec/DevTests/Radiation/inputs_radiation similarity index 100% rename from Exec/Radiation/inputs_radiation rename to Exec/DevTests/Radiation/inputs_radiation diff --git a/Source/TimeIntegration/ERF_MRI.H b/Source/TimeIntegration/ERF_MRI.H index 58302b1a9..0a38cc63c 100644 --- a/Source/TimeIntegration/ERF_MRI.H +++ b/Source/TimeIntegration/ERF_MRI.H @@ -273,6 +273,10 @@ public: nsubsteps = 1; dtau = timestep; } else { nsubsteps = substep_ratio; dtau = sub_timestep; + + // START HACK -- this hack can be used to approximate the no-substepping algorithm + // nsubsteps = 1; dtau = timestep; + // END HACK } time_stage = time + timestep; } diff --git a/Source/TimeIntegration/ERF_TI_fast_headers.H b/Source/TimeIntegration/ERF_TI_fast_headers.H index 4504db371..5c470d9da 100644 --- a/Source/TimeIntegration/ERF_TI_fast_headers.H +++ b/Source/TimeIntegration/ERF_TI_fast_headers.H @@ -37,7 +37,8 @@ void erf_fast_rhs_N (int step, int nrk, int level, int finest_level, std::unique_ptr& mapfac_v, amrex::YAFluxRegister* fr_as_crse, amrex::YAFluxRegister* fr_as_fine, - bool l_use_moisture, bool l_reflux); + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); /** * Function for computing the fast RHS with fixed terrain @@ -64,7 +65,8 @@ void erf_fast_rhs_T (int step, int nrk, int level, int finest_level, std::unique_ptr& mapfac_v, amrex::YAFluxRegister* fr_as_crse, amrex::YAFluxRegister* fr_as_fine, - bool l_use_moisture, bool l_reflux); + bool l_use_moisture, bool l_reflux, + bool
l_implicit_substepping); /** * Function for computing the fast RHS with moving terrain @@ -98,7 +100,8 @@ void erf_fast_rhs_MT (int step, int nrk, int level, int finest_level, std::unique_ptr& mapfac_v, amrex::YAFluxRegister* fr_as_crse, amrex::YAFluxRegister* fr_as_fine, - bool l_use_moisture, bool l_reflux); + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); /** * Function for computing the coefficients for the tridiagonal solver used in the fast diff --git a/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H b/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H index f61dba333..c3fa90a70 100644 --- a/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H +++ b/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H @@ -20,7 +20,13 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, // Per p2902 of Klemp-Skamarock-Dudhia-2007 // beta_s = -1.0 : fully explicit // beta_s = 1.0 : fully implicit - Real beta_s = 0.1; + Real beta_s; + if (solverChoice.substepping_type[level] == SubsteppingType::Implicit) + { + beta_s = 0.1; + } else { // Fully explicit + beta_s = -1.0; + } // ************************************************************************* // Set up flux registers if using two_way coupling @@ -99,7 +105,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } else { // If this is not the first substep we pass in S_data as the previous step's solution erf_fast_rhs_MT(fast_step, nrk, level, finest_level, @@ -111,7 +117,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } } else if (solverChoice.use_terrain && solverChoice.terrain_type == TerrainType::Static) { if (fast_step == 0) { @@ -127,7 +133,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } else { // If this is not the first substep we pass in S_data as the previous step's solution erf_fast_rhs_T(fast_step, nrk, level, finest_level, @@ -135,7 +141,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } } else { if (fast_step == 0) { @@ -151,7 +157,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, S_data, S_scratch, fine_geom, solverChoice.gravity, dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } else { // If this is not the first substep we pass in S_data as the previous step's solution 
erf_fast_rhs_N(fast_step, nrk, level, finest_level, @@ -159,7 +165,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, S_data, S_scratch, fine_geom, solverChoice.gravity, dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } } diff --git a/Source/TimeIntegration/ERF_advance_dycore.cpp b/Source/TimeIntegration/ERF_advance_dycore.cpp index 4761c8646..8c78fcfb3 100644 --- a/Source/TimeIntegration/ERF_advance_dycore.cpp +++ b/Source/TimeIntegration/ERF_advance_dycore.cpp @@ -90,6 +90,7 @@ void ERF::advance_dycore(int level, bool l_use_kturb = ( (tc.les_type != LESType::None) || (tc.pbl_type != PBLType::None) ); bool l_use_moisture = ( solverChoice.moisture_type != MoistureType::None ); + bool l_implicit_substepping = ( solverChoice.substepping_type[level] == SubsteppingType::Implicit ); const bool use_most = (m_most != nullptr); const bool exp_most = (solverChoice.use_explicit_most); diff --git a/Source/TimeIntegration/ERF_fast_rhs_MT.cpp b/Source/TimeIntegration/ERF_fast_rhs_MT.cpp index 1ff3b524b..1843a7d8f 100644 --- a/Source/TimeIntegration/ERF_fast_rhs_MT.cpp +++ b/Source/TimeIntegration/ERF_fast_rhs_MT.cpp @@ -7,38 +7,40 @@ using namespace amrex; * Function for computing the fast RHS with moving terrain * * @param[in] step which fast time step within each Runge-Kutta step - * @param[in] nrk which Runge-Kutta step - * @param[in] level level of resolution - * @param[in] finest_level finest level of resolution - * @param[in] S_slow_rhs slow RHS computed in erf_slow_rhs_pre - * @param[in] S_prev previous solution - * @param[in] S_stg_data solution at previous RK stage - * @param[in] S_stg_prim primitive variables at previous RK stage - * @param[in] pi_stage Exner function at previous RK stage - * @param[in] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator - * @param[out] S_data current solution - * @param[in] S_scratch scratch space - * @param[in] geom container for geometric information - * @param[in] gravity Magnitude of gravity - * @param[in] use_lagged_delta_rt define lagged_delta_rt for our next step - * @param[in] Omega component of the momentum normal to the z-coordinate surface - * @param[in] z_t_rk rate of change of grid height -- only relevant for moving terrain - * @param[in] z_t_pert rate of change of grid height -- interpolated between RK stages - * @param[in] z_phys_nd_old height coordinate at nodes at old time - * @param[in] z_phys_nd_new height coordinate at nodes at new time - * @param[in] z_phys_nd_stg height coordinate at nodes at previous stage - * @param[in] detJ_cc_old Jacobian of the metric transformation at old time - * @param[in] detJ_cc_new Jacobian of the metric transformation at new time - * @param[in] detJ_cc_stg Jacobian of the metric transformation at previous stage - * @param[in] dtau fast time step - * @param[in] beta_s Coefficient which determines how implicit vs explicit the solve is - * @param[in] facinv inverse factor for time-averaging the momenta - * @param[in] mapfac_m map factor at cell centers - * @param[in] mapfac_u map factor at x-faces - * @param[in] mapfac_v map factor at y-faces + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev previous solution + * @param[in ] S_stg_data 
solution at previous RK stage + * @param[in ] S_stg_prim primitive variables at previous RK stage + * @param[in ] pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity Magnitude of gravity + * @param[in ] use_lagged_delta_rt define lagged_delta_rt for our next step + * @param[in ] Omega component of the momentum normal to the z-coordinate surface + * @param[in ] z_t_rk rate of change of grid height -- only relevant for moving terrain + * @param[in ] z_t_pert rate of change of grid height -- interpolated between RK stages + * @param[in ] z_phys_nd_old height coordinate at nodes at old time + * @param[in ] z_phys_nd_new height coordinate at nodes at new time + * @param[in ] z_phys_nd_stg height coordinate at nodes at previous stage + * @param[in ] detJ_cc_old Jacobian of the metric transformation at old time + * @param[in ] detJ_cc_new Jacobian of the metric transformation at new time + * @param[in ] detJ_cc_stg Jacobian of the metric transformation at previous stage + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface - * @param[in] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? 
+ * @param[in ] l_implicit_substepping */ void erf_fast_rhs_MT (int step, int nrk, @@ -71,7 +73,8 @@ void erf_fast_rhs_MT (int step, int nrk, YAFluxRegister* fr_as_crse, YAFluxRegister* fr_as_fine, bool l_use_moisture, - bool l_reflux) + bool l_reflux, + bool /*l_implicit_substepping*/) { BL_PROFILE_REGION("erf_fast_rhs_MT()"); diff --git a/Source/TimeIntegration/ERF_fast_rhs_N.cpp b/Source/TimeIntegration/ERF_fast_rhs_N.cpp index fec38e38e..5bafd34bf 100644 --- a/Source/TimeIntegration/ERF_fast_rhs_N.cpp +++ b/Source/TimeIntegration/ERF_fast_rhs_N.cpp @@ -6,39 +6,41 @@ using namespace amrex; /** * Function for computing the fast RHS with no terrain * - * @param[in] step which fast time step within each Runge-Kutta step - * @param[in] nrk which Runge-Kutta step - * @param[in] level level of resolution - * @param[in] finest_level finest level of resolution - * @param[in] S_slow_rhs slow RHS computed in erf_slow_rhs_pre - * @param[in] S_prev previous solution - * @param[in] S_stage_data solution at previous RK stage - * @param[in] S_stage_prim primitive variables at previous RK stage - * @param[in] pi_stage Exner function at previous RK stage - * @param[in] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator - * @param[out] S_data current solution - * @param[in] S_scratch scratch space - * @param[in] geom container for geometric information - * @param[in] gravity magnitude of gravity - * @param[in] dtau fast time step - * @param[in] beta_s Coefficient which determines how implicit vs explicit the solve is - * @param[in] facinv inverse factor for time-averaging the momenta - * @param[in] mapfac_m map factor at cell centers - * @param[in] mapfac_u map factor at x-faces - * @param[in] mapfac_v map factor at y-faces - * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface - * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface - * @param[in] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] step which fast time step within each Runge-Kutta step + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev if step == 0, this is S_old, else the previous fast solution + * @param[in ] S_stage_data solution at previous RK stage + * @param[in ] S_stage_prim primitive variables at previous RK stage + * @param[in ] pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity magnitude of gravity + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces + * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface + * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? 
+ * @param[in ] l_implicit_substepping */ void erf_fast_rhs_N (int step, int nrk, int level, int finest_level, Vector& S_slow_rhs, // the slow RHS already computed const Vector& S_prev, // if step == 0, this is S_old, else the previous solution - Vector& S_stage_data, // S_bar = S^n, S^* or S^** - const MultiFab& S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] - const MultiFab& pi_stage, // Exner function evaluated at last stage - const MultiFab& fast_coeffs, // Coeffs for tridiagonal solve + Vector& S_stage_data, // S_stage = S^n, S^* or S^** + const MultiFab & S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] + const MultiFab & pi_stage, // Exner function evaluated at last stage + const MultiFab &fast_coeffs, // Coeffs for tridiagonal solve Vector& S_data, // S_sum = most recent full solution Vector& S_scratch, // S_sum_old at most recent fast timestep for (rho theta) const Geometry geom, @@ -51,8 +53,13 @@ void erf_fast_rhs_N (int step, int nrk, YAFluxRegister* fr_as_crse, YAFluxRegister* fr_as_fine, bool l_use_moisture, - bool l_reflux) + bool l_reflux, + bool l_implicit_substepping) { + // + // NOTE: for step > 0, S_data and S_prev point to the same MultiFab data!! + // + BL_PROFILE_REGION("erf_fast_rhs_N()"); Real beta_1 = 0.5 * (1.0 - beta_s); // multiplies explicit terms @@ -71,9 +78,8 @@ void erf_fast_rhs_N (int step, int nrk, const auto& ba = S_stage_data[IntVars::cons].boxArray(); const auto& dm = S_stage_data[IntVars::cons].DistributionMap(); - MultiFab Delta_rho_w( convert(ba,IntVect(0,0,1)), dm, 1, IntVect(1,1,0)); - MultiFab Delta_rho ( ba , dm, 1, 1); MultiFab Delta_rho_theta( ba , dm, 1, 1); + MultiFab Delta_rho_w (convert(ba,IntVect(0,0,1)), dm, 1, IntVect(1,1,0)); MultiFab coeff_A_mf(fast_coeffs, make_alias, 0, 1); MultiFab inv_coeff_B_mf(fast_coeffs, make_alias, 1, 1); @@ -96,6 +102,9 @@ void erf_fast_rhs_N (int step, int nrk, MultiFab temp_cur_xmom(S_stage_data[IntVars::xmom].boxArray(),S_stage_data[IntVars::xmom].DistributionMap(),1,0); MultiFab temp_cur_ymom(S_stage_data[IntVars::ymom].boxArray(),S_stage_data[IntVars::ymom].DistributionMap(),1,0); + // We assume that in the first step (nrk == 0) we are only doing one substep. 
+ AMREX_ALWAYS_ASSERT(nrk > 0 || step == 0); + // ************************************************************************* // First set up some arrays we'll need // ************************************************************************* @@ -105,61 +114,40 @@ void erf_fast_rhs_N (int step, int nrk, #endif for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) { - const Array4 & cur_cons = S_data[IntVars::cons].array(mfi); const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); - const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); - const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); - - const Array4& old_drho = Delta_rho.array(mfi); - const Array4& old_drho_w = Delta_rho_w.array(mfi); - const Array4& old_drho_theta = Delta_rho_theta.array(mfi); + const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); - const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); + const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); const Array4& stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); - Box gbx = mfi.tilebox(); gbx.grow(1); - - if (step == 0) { - ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept - { - cur_cons(i,j,k,Rho_comp) = prev_cons(i,j,k,Rho_comp); - cur_cons(i,j,k,RhoTheta_comp) = prev_cons(i,j,k,RhoTheta_comp); - }); - } // step = 0 - - Box gtbz = mfi.nodaltilebox(2); - gtbz.grow(IntVect(1,1,0)); - ParallelFor(gtbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - old_drho_w(i,j,k) = prev_zmom(i,j,k) - stage_zmom(i,j,k); - }); - + const Array4& prev_drho_w = Delta_rho_w.array(mfi); + const Array4& prev_drho_theta = Delta_rho_theta.array(mfi); + const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); const Array4& theta_extrap = extrap.array(mfi); - ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - old_drho(i,j,k) = cur_cons(i,j,k,Rho_comp) - stage_cons(i,j,k,Rho_comp); - old_drho_theta(i,j,k) = cur_cons(i,j,k,RhoTheta_comp) - stage_cons(i,j,k,RhoTheta_comp); + + Box gbx = mfi.growntilebox(1); + ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + prev_drho_theta(i,j,k) = prev_cons(i,j,k,RhoTheta_comp) - stage_cons(i,j,k,RhoTheta_comp); if (step == 0) { - theta_extrap(i,j,k) = old_drho_theta(i,j,k); + theta_extrap(i,j,k) = prev_drho_theta(i,j,k); } else { - theta_extrap(i,j,k) = old_drho_theta(i,j,k) + beta_d * - ( old_drho_theta(i,j,k) - lagged_delta_rt(i,j,k,RhoTheta_comp) ); + theta_extrap(i,j,k) = prev_drho_theta(i,j,k) + beta_d * + ( prev_drho_theta(i,j,k) - lagged_delta_rt(i,j,k,RhoTheta_comp) ); } - }); - } // mfi - -#ifdef _OPENMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - // We define lagged_delta_rt for our next step as the current delta_rt - Box gbx = mfi.tilebox(); gbx.grow(1); - const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); - const Array4& old_drho_theta = Delta_rho_theta.array(mfi); + // We define lagged_delta_rt for our next step as the current delta_rt + // (after using it above to extrapolate theta for this step) + lagged_delta_rt(i,j,k,RhoTheta_comp) = prev_drho_theta(i,j,k); + }); - ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - lagged_delta_rt(i,j,k,RhoTheta_comp) = old_drho_theta(i,j,k); + // NOTE: We must do this here because for step > 0, prev_zmom and cur_zmom both point to the same data, + // so by 
the time we would use prev_zmom to define zflux, it would have already been over-written. + Box gtbz = mfi.nodaltilebox(2); + gtbz.grow(IntVect(1,1,0)); + ParallelFor(gtbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { + prev_drho_w(i,j,k) = prev_zmom(i,j,k) - stage_zmom(i,j,k); }); } // mfi @@ -203,56 +191,69 @@ void erf_fast_rhs_N (int step, int nrk, // ********************************************************************* // Define updates in the RHS of {x, y, z}-momentum equations // ********************************************************************* - { - BL_PROFILE("fast_rhs_xymom"); - ParallelFor(tbx, tby, - [=] AMREX_GPU_DEVICE (int i, int j, int k) - { - // Add (negative) gradient of (rho theta) multiplied by lagged "pi" - Real gpx = (theta_extrap(i,j,k) - theta_extrap(i-1,j,k))*dxi; - gpx *= mf_u(i,j,0); - - if (l_use_moisture) { - Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i-1,j,k,PrimQ1_comp) - +prim(i,j,k,PrimQ2_comp) + prim(i-1,j,k,PrimQ2_comp) ); - gpx /= (1.0 + q); - } - - Real pi_c = 0.5 * (pi_stage_ca(i-1,j,k,0) + pi_stage_ca(i,j,k,0)); + if (nrk == 0 and step == 0) { + ParallelFor(tbx, tby, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + Real new_drho_u = prev_xmom(i,j,k) - stage_xmom(i,j,k) + dtau * slow_rhs_rho_u(i,j,k); + avg_xmom(i,j,k) += facinv*new_drho_u; + temp_cur_xmom_arr(i,j,k) = stage_xmom(i,j,k) + new_drho_u; + }, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + Real new_drho_v = prev_ymom(i,j,k) - stage_ymom(i,j,k) + dtau * slow_rhs_rho_v(i,j,k); + avg_ymom(i,j,k) += facinv*new_drho_v; + temp_cur_ymom_arr(i,j,k) = stage_ymom(i,j,k) + new_drho_v; + }); + } else { + ParallelFor(tbx, tby, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + // Add (negative) gradient of (rho theta) multiplied by lagged "pi" + Real gpx = (theta_extrap(i,j,k) - theta_extrap(i-1,j,k))*dxi; + gpx *= mf_u(i,j,0); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i-1,j,k,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i-1,j,k,PrimQ2_comp) ); + gpx /= (1.0 + q); + } - Real fast_rhs_rho_u = -Gamma * R_d * pi_c * gpx; + Real pi_c = 0.5 * (pi_stage_ca(i-1,j,k,0) + pi_stage_ca(i,j,k,0)); - Real new_drho_u = prev_xmom(i,j,k) - stage_xmom(i,j,k) - + dtau * fast_rhs_rho_u + dtau * slow_rhs_rho_u(i,j,k); + Real fast_rhs_rho_u = -Gamma * R_d * pi_c * gpx; - avg_xmom(i,j,k) += facinv*new_drho_u; + Real new_drho_u = prev_xmom(i,j,k) - stage_xmom(i,j,k) + + dtau * fast_rhs_rho_u + dtau * slow_rhs_rho_u(i,j,k); - temp_cur_xmom_arr(i,j,k) = stage_xmom(i,j,k) + new_drho_u; - }, - [=] AMREX_GPU_DEVICE (int i, int j, int k) - { - // Add (negative) gradient of (rho theta) multiplied by lagged "pi" - Real gpy = (theta_extrap(i,j,k) - theta_extrap(i,j-1,k))*dyi; - gpy *= mf_v(i,j,0); + avg_xmom(i,j,k) += facinv*new_drho_u; - if (l_use_moisture) { - Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i,j-1,k,PrimQ1_comp) - +prim(i,j,k,PrimQ2_comp) + prim(i,j-1,k,PrimQ2_comp) ); - gpy /= (1.0 + q); - } + temp_cur_xmom_arr(i,j,k) = stage_xmom(i,j,k) + new_drho_u; + }, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + // Add (negative) gradient of (rho theta) multiplied by lagged "pi" + Real gpy = (theta_extrap(i,j,k) - theta_extrap(i,j-1,k))*dyi; + gpy *= mf_v(i,j,0); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i,j-1,k,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i,j-1,k,PrimQ2_comp) ); + gpy /= (1.0 + q); + } - Real pi_c = 0.5 * (pi_stage_ca(i,j-1,k,0) + pi_stage_ca(i,j,k,0)); + Real pi_c = 0.5 * (pi_stage_ca(i,j-1,k,0) + 
pi_stage_ca(i,j,k,0)); - Real fast_rhs_rho_v = -Gamma * R_d * pi_c * gpy; + Real fast_rhs_rho_v = -Gamma * R_d * pi_c * gpy; - Real new_drho_v = prev_ymom(i,j,k) - stage_ymom(i,j,k) - + dtau * fast_rhs_rho_v + dtau * slow_rhs_rho_v(i,j,k); + Real new_drho_v = prev_ymom(i,j,k) - stage_ymom(i,j,k) + + dtau * fast_rhs_rho_v + dtau * slow_rhs_rho_v(i,j,k); - avg_ymom(i,j,k) += facinv*new_drho_v; + avg_ymom(i,j,k) += facinv*new_drho_v; - temp_cur_ymom_arr(i,j,k) = stage_ymom(i,j,k) + new_drho_v; - }); - } //profile + temp_cur_ymom_arr(i,j,k) = stage_ymom(i,j,k) + new_drho_v; + }); + } // nrk > 0 and/or step > 0 } //mfi #ifdef _OPENMP @@ -268,24 +269,25 @@ void erf_fast_rhs_N (int step, int nrk, Box vbx = mfi.validbox(); const auto& vbx_hi = ubound(vbx); - const Array4 & stage_xmom = S_stage_data[IntVars::xmom].const_array(mfi); - const Array4 & stage_ymom = S_stage_data[IntVars::ymom].const_array(mfi); - const Array4 & stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); + const Array4& stage_xmom = S_stage_data[IntVars::xmom].const_array(mfi); + const Array4& stage_ymom = S_stage_data[IntVars::ymom].const_array(mfi); + const Array4& stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); const Array4 & prim = S_stage_prim.const_array(mfi); - const Array4& old_drho_w = Delta_rho_w.array(mfi); - const Array4& old_drho = Delta_rho.array(mfi); - const Array4& old_drho_theta = Delta_rho_theta.array(mfi); + const Array4& prev_drho_theta = Delta_rho_theta.array(mfi); + + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); + const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); const Array4& slow_rhs_cons = S_slow_rhs[IntVars::cons].const_array(mfi); const Array4& slow_rhs_rho_w = S_slow_rhs[IntVars::zmom].const_array(mfi); - const Array4& cur_zmom = S_data[IntVars::zmom].array(mfi); + const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); + const Array4< Real>& cur_zmom = S_data[IntVars::zmom].array(mfi); const Array4& temp_cur_xmom_arr = temp_cur_xmom.array(mfi); const Array4& temp_cur_ymom_arr = temp_cur_ymom.array(mfi); - const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); // These store the advection momenta which we will use to update the slow variables const Array4< Real>& avg_zmom = S_scratch[IntVars::zmom].array(mfi); @@ -368,8 +370,6 @@ void erf_fast_rhs_N (int step, int nrk, // ********************************************************************* // fast_loop_on_shrunk // ********************************************************************* - { - BL_PROFILE("fast_loop_on_shrunk"); //Note we don't act on the bottom or top boundaries of the domain ParallelFor(bx_shrunk_in_k, [=] AMREX_GPU_DEVICE (int i, int j, int k) { @@ -392,8 +392,10 @@ void erf_fast_rhs_N (int step, int nrk, Real Omega_km1 = prev_zmom(i,j,k-1) - stage_zmom(i,j,k-1); // line 2 last two terms (order dtau) - Real R0_tmp = coeff_P * old_drho_theta(i,j,k) + coeff_Q * old_drho_theta(i,j,k-1) - - halfg * ( old_drho(i,j,k) + old_drho(i,j,k-1) ); + Real old_drho_k = prev_cons(i,j,k ,Rho_comp) - stage_cons(i,j,k ,Rho_comp); + Real old_drho_km1 = prev_cons(i,j,k-1,Rho_comp) - stage_cons(i,j,k-1,Rho_comp); + Real R0_tmp = coeff_P * prev_drho_theta(i,j,k) + coeff_Q * prev_drho_theta(i,j,k-1) + - halfg * ( old_drho_k + old_drho_km1 ); // lines 3-5 residuals (order dtau^2) 1.0 <-> beta_2 Real R1_tmp = halfg * (-slow_rhs_cons(i,j,k ,Rho_comp) @@ -409,94 +411,114 @@ void erf_fast_rhs_N (int step, int nrk, // line 1 RHS_a(i,j,k) = Omega_k + dtau * (slow_rhs_rho_w(i,j,k) + 
R0_tmp + dtau * beta_2 * R1_tmp); - }); - } // end profile + + }); // bx_shrunk_in_k Box b2d = tbz; // Copy constructor b2d.setRange(2,0); - { - BL_PROFILE("fast_rhs_b2d_loop"); -#ifdef AMREX_USE_GPU auto const lo = lbound(bx); auto const hi = ubound(bx); + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) { // w at bottom boundary of grid is 0 if at domain boundary, otherwise w = w_old + dtau * slow_rhs - RHS_a(i,j,lo.z) = dtau * slow_rhs_rho_w(i,j,lo.z); + RHS_a (i,j,lo.z ) = prev_zmom(i,j,lo.z ) - stage_zmom(i,j,lo.z) + + dtau * slow_rhs_rho_w(i,j,lo.z); // w at top boundary of grid is 0 if at domain boundary, otherwise w = w_old + dtau * slow_rhs - // TODO TODO: Note that if we ever change this, we will need to include it in avg_zmom at the top - RHS_a(i,j,hi.z+1) = dtau * slow_rhs_rho_w(i,j,hi.z+1); + RHS_a (i,j,hi.z+1) = prev_zmom(i,j,hi.z+1) - stage_zmom(i,j,hi.z+1) + + dtau * slow_rhs_rho_w(i,j,hi.z+1); + }); // b2d - // w = specified Dirichlet value at k = lo.z - soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); - cur_zmom(i,j,lo.z) = stage_zmom(i,j,lo.z) + soln_a(i,j,lo.z); +#ifdef AMREX_USE_GPU + if (l_implicit_substepping) { - for (int k = lo.z+1; k <= hi.z+1; k++) { - soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); - } + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + // w = specified Dirichlet value at k = lo.z + soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); + cur_zmom(i,j,lo.z) = stage_zmom(i,j,lo.z) + soln_a(i,j,lo.z); - cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + for (int k = lo.z+1; k <= hi.z+1; k++) { + soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + } - for (int k = hi.z; k >= lo.z; k--) { - soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) *soln_a(i,j,k+1); - cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); - } - }); // b2d + cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + + for (int k = hi.z; k >= lo.z; k--) { + soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) *soln_a(i,j,k+1); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + }); // b2d + + } else { // explicit substepping (beta_1 = 1; beta_2 = 0) + + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + for (int k = lo.z; k <= hi.z+1; k++) { + soln_a(i,j,k) = RHS_a(i,j,k); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + }); // b2d + } // end of explicit substepping #else - auto const lo = lbound(bx); - auto const hi = ubound(bx); - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - // w at bottom boundary of grid is 0 if at domain boundary, otherwise w_old + dtau * slow_rhs - RHS_a (i,j,lo.z) = dtau * slow_rhs_rho_w(i,j,lo.z); - soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); - } - } - // Note that if we ever change this, we will need to include it in avg_zmom at the top - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - RHS_a (i,j,hi.z+1) = dtau * slow_rhs_rho_w(i,j,hi.z+1); - } - } - for (int k = lo.z+1; k <= hi.z+1; ++k) { + if (l_implicit_substepping) { + for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); } } - } - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = 
lo.x; i <= hi.x; ++i) { - cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + for (int k = lo.z+1; k <= hi.z+1; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + } + } } - } - for (int k = hi.z; k >= lo.z; --k) { for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) * soln_a(i,j,k+1); - cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); } } - } + for (int k = hi.z; k >= lo.z; --k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) * soln_a(i,j,k+1); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + } + } + } else { // explicit substepping (beta_1 = 1; beta_2 = 0) + + for (int k = lo.z; k <= hi.z+1; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + + soln_a(i,j,k) = RHS_a(i,j,k); + + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + } + } + + } // end of explicit substepping #endif - } // end profile // ************************************************************************** // Define updates in the RHS of rho and (rho theta) // ************************************************************************** - { - BL_PROFILE("fast_rho_final_update"); + const Array4& prev_drho_w = Delta_rho_w.array(mfi); ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - Real zflux_lo = beta_2 * soln_a(i,j,k ) + beta_1 * old_drho_w(i,j,k ); - Real zflux_hi = beta_2 * soln_a(i,j,k+1) + beta_1 * old_drho_w(i,j,k+1); + Real zflux_lo = beta_2 * soln_a(i,j,k ) + beta_1 * prev_drho_w(i,j,k ); + Real zflux_hi = beta_2 * soln_a(i,j,k+1) + beta_1 * prev_drho_w(i,j,k+1); avg_zmom(i,j,k) += facinv*zflux_lo / (mf_m(i,j,0) * mf_m(i,j,0)); (flx_arr[2])(i,j,k,0) = zflux_lo / (mf_m(i,j,0) * mf_m(i,j,0)); @@ -512,7 +534,6 @@ void erf_fast_rhs_N (int step, int nrk, temp_rhs_arr(i,j,k,RhoTheta_comp) += 0.5 * dzi * ( zflux_hi * (prim(i,j,k) + prim(i,j,k+1)) - zflux_lo * (prim(i,j,k) + prim(i,j,k-1)) ); }); - } // end profile // We only add to the flux registers in the final RK step if (l_reflux && nrk == 2) { @@ -546,15 +567,29 @@ void erf_fast_rhs_N (int step, int nrk, { const Box& bx = mfi.tilebox(); - int cons_dycore{2}; - const Array4& cur_cons = S_data[IntVars::cons].array(mfi); + const Array4< Real>& cur_cons = S_data[IntVars::cons].array(mfi); + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); auto const& temp_rhs_arr = temp_rhs.const_array(mfi); auto const& slow_rhs_cons = S_slow_rhs[IntVars::cons].const_array(mfi); - ParallelFor(bx, cons_dycore, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept - { - cur_cons(i,j,k,n) += dtau * (slow_rhs_cons(i,j,k,n) - temp_rhs_arr(i,j,k,n)); - }); + if (step == 0) { + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + cur_cons(i,j,k,Rho_comp) = prev_cons(i,j,k,Rho_comp) + + dtau * (slow_rhs_cons(i,j,k,Rho_comp) - temp_rhs_arr(i,j,k,Rho_comp)); + cur_cons(i,j,k,RhoTheta_comp) = prev_cons(i,j,k,RhoTheta_comp) + + dtau * (slow_rhs_cons(i,j,k,RhoTheta_comp) - temp_rhs_arr(i,j,k,RhoTheta_comp)); + }); + } else { + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + // + // We didn't need to 
set cur_cons = prev_cons above because they point to the same data for step > 0 + // + cur_cons(i,j,k,Rho_comp) += dtau * (slow_rhs_cons(i,j,k,Rho_comp) - temp_rhs_arr(i,j,k,Rho_comp)); + cur_cons(i,j,k,RhoTheta_comp) += dtau * (slow_rhs_cons(i,j,k,RhoTheta_comp) - temp_rhs_arr(i,j,k,RhoTheta_comp)); + }); + } // step = 0 const Array4& cur_xmom = S_data[IntVars::xmom].array(mfi); const Array4& cur_ymom = S_data[IntVars::ymom].array(mfi); diff --git a/Source/TimeIntegration/ERF_fast_rhs_T.cpp b/Source/TimeIntegration/ERF_fast_rhs_T.cpp index a26289176..ed97b8469 100644 --- a/Source/TimeIntegration/ERF_fast_rhs_T.cpp +++ b/Source/TimeIntegration/ERF_fast_rhs_T.cpp @@ -6,39 +6,41 @@ using namespace amrex; /** * Function for computing the fast RHS with fixed terrain * - * @param[in] step which fast time step within each Runge-Kutta step - * @param[in] nrk which Runge-Kutta step - * @param[in] level level of resolution - * @param[in] finest_level finest level of resolution - * @param[in] S_slow_rhs slow RHS computed in erf_slow_rhs_pre - * @param[in] S_prev previous solution - * @param[in] S_stage_data solution at previous RK stage - * @param[in] S_stage_prim primitive variables at previous RK stage - * @param[in] pi_stage Exner function at previous RK stage - * @param[in] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator - * @param[out] S_data current solution - * @param[in] S_scratch scratch space - * @param[in] geom container for geometric information - * @param[in] gravity magnitude of gravity - * @param[in] Omega component of the momentum normal to the z-coordinate surface - * @param[in] z_phys_nd height coordinate at nodes - * @param[in] detJ_cc Jacobian of the metric transformation - * @param[in] dtau fast time step - * @param[in] beta_s Coefficient which determines how implicit vs explicit the solve is - * @param[in] facinv inverse factor for time-averaging the momenta - * @param[in] mapfac_m map factor at cell centers - * @param[in] mapfac_u map factor at x-faces - * @param[in] mapfac_v map factor at y-faces + * @param[in ] step which fast time step within each Runge-Kutta step + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev previous solution + * @param[in ] S_stage_data solution at previous RK stage + * @param[in ] S_stage_prim primitive variables at previous RK stage + * @param[in ] pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity magnitude of gravity + * @param[in ] Omega component of the momentum normal to the z-coordinate surface + * @param[in ] z_phys_nd height coordinate at nodes + * @param[in ] detJ_cc Jacobian of the metric transformation + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l 
interface - * @param[in] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_implicit_substepping */ void erf_fast_rhs_T (int step, int nrk, int level, int finest_level, Vector& S_slow_rhs, // the slow RHS already computed const Vector& S_prev, // if step == 0, this is S_old, else the previous solution - Vector& S_stage_data, // S_bar = S^n, S^* or S^** + Vector& S_stage_data, // S_stage = S^n, S^* or S^** const MultiFab& S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] const MultiFab& pi_stage, // Exner function evaluated at last stage const MultiFab& fast_coeffs, // Coeffs for tridiagonal solve @@ -57,7 +59,8 @@ void erf_fast_rhs_T (int step, int nrk, YAFluxRegister* fr_as_crse, YAFluxRegister* fr_as_fine, bool l_use_moisture, - bool l_reflux) + bool l_reflux, + bool /*l_implicit_substepping*/) { BL_PROFILE_REGION("erf_fast_rhs_T()");
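Note for reviewers: the l_implicit_substepping flag threaded through these signatures selects between the two vertical-momentum updates in erf_fast_rhs_N above: the implicit branch runs the forward-sweep / back-substitution tridiagonal solve using the pre-factored coefficients in fast_coeffs, while the explicit branch (beta_s = -1, so beta_2 = 0) reduces to soln = RHS. A simplified single-column C++ sketch of that branch structure (illustrative names and values, not the ERF implementation):

    #include <cassert>
    #include <vector>

    // inv_B is assumed pre-factored (the analogue of inv_coeffB in fast_coeffs),
    // so the implicit branch is the standard Thomas-algorithm sweep pattern.
    void vertical_update (bool l_implicit_substepping,
                          const std::vector<double>& A,     // sub-diagonal
                          const std::vector<double>& inv_B, // pre-factored inverse diagonal
                          const std::vector<double>& C,     // super-diagonal
                          const std::vector<double>& rhs,
                          std::vector<double>& soln)
    {
        const int n = static_cast<int>(rhs.size());
        if (l_implicit_substepping) {
            // Forward sweep up the column ...
            soln[0] = rhs[0] * inv_B[0];
            for (int k = 1; k < n; ++k) {
                soln[k] = (rhs[k] - A[k] * soln[k-1]) * inv_B[k];
            }
            // ... then back substitution down the column.
            for (int k = n - 2; k >= 0; --k) {
                soln[k] -= (C[k] * inv_B[k]) * soln[k+1];
            }
        } else {
            // Fully explicit substepping: no vertical coupling, the RHS is the answer.
            for (int k = 0; k < n; ++k) {
                soln[k] = rhs[k];
            }
        }
    }

    int main () {
        std::vector<double> A{0.0, -1.0, -1.0}, inv_B{0.5, 0.5, 0.5},
                            C{-1.0, -1.0, 0.0}, rhs{1.0, 1.0, 1.0}, soln(3);
        vertical_update(true,  A, inv_B, C, rhs, soln); // exercises the implicit path
        vertical_update(false, A, inv_B, C, rhs, soln); // explicit path: soln == rhs
        assert(soln == rhs);
        return 0;
    }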