diff --git a/.github/workflows/c-linter.yml b/.github/workflows/c-linter.yml index 2a7d447b7..685a4120b 100644 --- a/.github/workflows/c-linter.yml +++ b/.github/workflows/c-linter.yml @@ -3,7 +3,7 @@ name: cpp-linter on: [pull_request] jobs: cpp-linter: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: diff --git a/.github/workflows/cleanup-cache-postpr.yml b/.github/workflows/cleanup-cache-postpr.yml index 5e9a70cd5..8ae436be8 100644 --- a/.github/workflows/cleanup-cache-postpr.yml +++ b/.github/workflows/cleanup-cache-postpr.yml @@ -9,7 +9,7 @@ on: jobs: CleanUpCcacheCachePostPR: name: Clean Up Ccache Cache Post PR - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: actions: write contents: read diff --git a/.github/workflows/cleanup-cache.yml b/.github/workflows/cleanup-cache.yml index 8779271c2..99592dc78 100644 --- a/.github/workflows/cleanup-cache.yml +++ b/.github/workflows/cleanup-cache.yml @@ -9,7 +9,7 @@ on: jobs: CleanUpCcacheCache: name: Clean Up Ccache Cache for ${{ github.event.workflow_run.name }} - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: actions: write contents: read diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index bec24c5c7..3d2b5d5a5 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -8,7 +8,7 @@ concurrency: jobs: codespell: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0ee3edc8b..33f594a2b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -8,7 +8,7 @@ concurrency: jobs: build-and-deploy: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index 3504eab14..e69bc4927 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -2,7 +2,7 @@ on: [push] jobs: paper: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 name: Paper Draft steps: - name: Checkout diff --git a/.github/workflows/post-pr.yml b/.github/workflows/post-pr.yml index 5f0b15349..23b06df6f 100644 --- a/.github/workflows/post-pr.yml +++ b/.github/workflows/post-pr.yml @@ -10,7 +10,7 @@ on: jobs: noop: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: No OP run: echo "This workflow is going to trigger CleanUpCachePostPR." 
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 5b5af5e8d..5756fdb27 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -8,14 +8,14 @@ concurrency: jobs: tabs: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - name: Tabs run: .github/workflows/style/check_tabs.sh trailing_whitespaces: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 - name: Trailing Whitespaces diff --git a/.github/workflows/sycl.yml b/.github/workflows/sycl.yml index 471978d8c..f7ad7b22d 100644 --- a/.github/workflows/sycl.yml +++ b/.github/workflows/sycl.yml @@ -18,7 +18,7 @@ concurrency: jobs: Build-And-Test-SYCL: name: oneAPI SYCL - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 with: diff --git a/.gitlab/LC/.gitlab-ci.yml b/.gitlab/LC/.gitlab-ci.yml new file mode 100644 index 000000000..c42d6f952 --- /dev/null +++ b/.gitlab/LC/.gitlab-ci.yml @@ -0,0 +1,95 @@ +variables: + CUSTOM_CI_BUILDS_DIR: "/usr/workspace/$$USER/erf_gitlab_runner" + + GIT_STRATEGY: fetch + GIT_SUBMODULE_STRATEGY: recursive + GIT_DEPTH: 1 + GIT_SUBMODULE_DEPTH: 1 + + DEFAULT_BRANCH: llnl/development + + ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} + ALLOC_QUEUE: pci + ALLOC_TIME: 30 + ALLOC_BANK: accatm + + TEST_SCRIPT: .gitlab/LC/gitlab_test.sh + + # Uncomment to disable testing on particular system + #ON_LASSEN: "OFF" + #ON_DANE: "OFF" + #ON_TIOGA: "OFF" + +stages: + - style + - allocate + - build + - release + +workflow: + rules: + # skip running branch pipelines if a MR is open for the branch + - if: $CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS && $CI_PIPELINE_SOURCE == "push" + when: never + - if: $CI_PIPELINE_SOURCE == 'merge_request_event' + - if: $CI_PIPELINE_SOURCE == 'web' + - if: $CI_COMMIT_TAG + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + # test the upstream branch + - if: $CI_COMMIT_BRANCH == 'development' + # branches starting with "gitlab" + - if: $CI_COMMIT_BRANCH =~ /^gitlab.*/ + +include: + # This include is required for LC with Gitlab 17+ + # Refer to https://hpc.llnl.gov/technical-bulletins/bulletin-568 + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' + - .gitlab/LC/runners/lassen.yml + - .gitlab/LC/runners/dane.yml + - .gitlab/LC/runners/tioga.yml + +# Define actual CI jobs here: +check_style: + extends: .on_dane + stage: style + rules: + # always run the style check on any push event + - if: $CI_PIPELINE_SOURCE == "push" + - when: on_success + script: + - echo "Running check_tabs.sh" + - .github/workflows/style/check_tabs.sh + - echo "Running check_trailing_whitespaces.sh" + - .github/workflows/style/check_trailing_whitespaces.sh + +dane_gcc_12_1_1: + variables: + MODULE_LIST: cmake gcc/12.1.1 + extends: .job_on_dane + +lassen_gcc_12_2_1: + variables: + MODULE_LIST: cmake/3.23.1 gcc/12.2.1 + extends: .job_on_lassen + +lassen_gcc_12_2_1_cuda: + variables: + MODULE_LIST: cmake/3.23.1 gcc/12.2.1 cuda/12.2.2 + ERF_ENABLE_CUDA: "ON" + # NOTE: c++ and cc are used here over mpicxx/mpicc due to cmake issue finding mpi with cuda? 
+ CMAKE_CXX_COMPILER: c++ + CMAKE_C_COMPILER: cc + CUDA_ARCH: "70" + ERF_TEST_FCOMPARE_RTOL: "1.0e-8" + ERF_TEST_FCOMPARE_ATOL: "1.0e-9" + extends: .job_on_lassen + +tioga_hip_5.7.1: + variables: + MODULE_LIST: cmake/3.24.2 rocm/6.1.2 rocmcc/6.1.2-cce-18.0.0-magic craype-accel-amd-gfx90a + ERF_ENABLE_HIP: "ON" + AMD_ARCH: "gfx90a" + # NOTE: Running with Debug build type causes AMD linking errors with AMReX plotfiles=ON + BUILD_TYPE: "RelWithDebInfo" + extends: .job_on_tioga diff --git a/.gitlab/LC/gitlab_test.sh b/.gitlab/LC/gitlab_test.sh new file mode 100755 index 000000000..9ca01fbc1 --- /dev/null +++ b/.gitlab/LC/gitlab_test.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail + +modules=${MODULE_LIST:-""} +mpiexec_executable=${MPIEXEC_EXECUTABLE:-"srun"} +# If using flux, append "run" after the flux executable path +if [[ "${mpiexec_executable}" == "flux" ]] +then + mpiexec_executable="$(which ${mpiexec_executable}) run" + flux jobs + flux resource list +else + mpiexec_executable="$(which ${mpiexec_executable})" +fi + +mpiexec_preflags=${MPIEXEC_PREFLAGS:-""} +host=$(hostname) +build_type=${BUILD_TYPE:-"Debug"} + +ERF_ENABLE_CUDA=${ERF_ENABLE_CUDA:-"OFF"} + +basehost=${host//[[:digit:]]/} + +echo ${host} + +build_dir=build_${host}_${CI_PIPELINE_ID}_$(date +%F_%H_%M_%S) + +if [[ -n ${modules} ]] +then + module load ${modules} +fi + +# Temporary workaround for CUDA builds: +# AMReX fcompare seems to not work as expected if compiled with CUDA. +# This builds a CPU version first and uses that fcompare executable during the +# testing for the CUDA build +if [[ "${ERF_ENABLE_CUDA}" == "ON" ]] +then + echo "=====================================================" + echo "Building CPU version first to get fcompare executable" + echo "=====================================================" + mkdir "${build_dir}_cpu" + cd "${build_dir}_cpu" + pwd + + cmake -DCMAKE_INSTALL_PREFIX:PATH=./install \ + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER:-"mpicxx"} \ + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER:-"mpicc"} \ + -DCMAKE_Fortran_COMPILER:STRING=${CMAKE_Fortran_COMPILER:-"mpifort"} \ + -DCMAKE_BUILD_TYPE:STRING=Release \ + -DERF_DIM:STRING=3 \ + -DERF_ENABLE_MPI:BOOL=ON \ + -DERF_ENABLE_CUDA:BOOL=OFF \ + -DERF_ENABLE_TESTS:BOOL=OFF \ + -DERF_ENABLE_FCOMPARE:BOOL=ON \ + -DERF_ENABLE_DOCUMENTATION:BOOL=OFF \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON .. 
+ make -j fcompare + + FCOMPARE_EXE="$(pwd)/Submodules/AMReX/Tools/Plotfile/amrex_fcompare" + + cd ../ + + echo "=====================================================" + echo "Using fcompare executable at: ${FCOMPARE_EXE}" + echo "=====================================================" +fi + +mkdir ${build_dir} +cd ${build_dir} +pwd + +cmake -DCMAKE_INSTALL_PREFIX:PATH=./install \ + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER:-"mpicxx"} \ + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER:-"mpicc"} \ + -DCMAKE_Fortran_COMPILER:STRING=${CMAKE_Fortran_COMPILER:-"mpifort"} \ + -DMPIEXEC_EXECUTABLE="${mpiexec_executable}" \ + -DMPIEXEC_PREFLAGS:STRING="${mpiexec_preflags}" \ + -DCMAKE_BUILD_TYPE:STRING="${build_type}" \ + -DERF_DIM:STRING=3 \ + -DERF_ENABLE_MPI:BOOL=ON \ + -DERF_ENABLE_CUDA:BOOL="${ERF_ENABLE_CUDA}" \ + -DAMReX_CUDA_ARCH:STRING="${CUDA_ARCH:-""}" \ + -DERF_ENABLE_HIP:BOOL="${ERF_ENABLE_HIP:-"OFF"}" \ + -DAMReX_AMD_ARCH:STRING="${AMD_ARCH:-""}" \ + -DERF_ENABLE_TESTS:BOOL=ON \ + -DERF_TEST_NRANKS:STRING=${ERF_TEST_NRANKS:-"4"} \ + -DERF_ENABLE_FCOMPARE:BOOL=ON \ + -DERF_ENABLE_DOCUMENTATION:BOOL=OFF \ + -DFCOMPARE_EXE="${FCOMPARE_EXE:-"$(pwd)/Submodules/AMReX/Tools/Plotfile/amrex_fcompare"}" \ + -DERF_TEST_FCOMPARE_RTOL="${ERF_TEST_FCOMPARE_RTOL:-"5.0e-9"}" \ + -DERF_TEST_FCOMPARE_ATOL="${ERF_TEST_FCOMPARE_ATOL:-"2.0e-10"}" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=ON \ + .. +make -j ${OMP_NUM_THREADS:-16} +ctest -VV --output-on-failure diff --git a/.gitlab/LC/runners/dane.yml b/.gitlab/LC/runners/dane.yml new file mode 100644 index 000000000..5dfa29f33 --- /dev/null +++ b/.gitlab/LC/runners/dane.yml @@ -0,0 +1,59 @@ +.retry: + retry: + max: 2 + when: + - runner_system_failure + +.on_dane: + extends: + - .retry + tags: + - dane + - shell + rules: + - if: '$ON_DANE == "OFF"' + when: never + # test the upstream branch + - if: $CI_COMMIT_BRANCH == 'development' + # branches starting with "gitlab" + - if: $CI_COMMIT_BRANCH =~ /^gitlab.*/ + - if: $CI_PIPELINE_SOURCE == "push" + when: never + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + - if: $CI_COMMIT_BRANCH == $DEFAULT_BRANCH + - if: '$CI_JOB_NAME =~ /release_resources_dane/' + when: always + - when: on_success + +allocate_resources_dane: + variables: + GIT_STRATEGY: none + extends: + - .on_dane + stage: allocate + script: + - salloc -N 1 --reservation=ci -A ${ALLOC_BANK} --time=${ALLOC_TIME} --no-shell --job-name=${ALLOC_NAME} + +release_resources_dane: + variables: + GIT_STRATEGY: none + extends: + - .on_dane + stage: release + script: + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) + when: always + +.job_on_dane: + extends: .on_dane + stage: build + needs: ["allocate_resources_dane"] + variables: + MPIEXEC_EXECUTABLE: srun + MPIEXEC_PREFLAGS: "--cpu-bind=cores -v" + script: + - echo "JOB NAME ${ALLOC_NAME}" + - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) + - echo "SLURM ID ${JOBID}" + - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -N 1 -t ${ALLOC_TIME} -v --overlap ${TEST_SCRIPT} diff --git a/.gitlab/LC/runners/lassen.yml b/.gitlab/LC/runners/lassen.yml new file mode 100644 index 000000000..bdefb3f85 --- /dev/null +++ b/.gitlab/LC/runners/lassen.yml @@ -0,0 +1,34 @@ +.retry: + retry: + max: 2 + when: + - runner_system_failure + +.on_lassen: + extends: + - .retry + tags: + - lassen + - shell + rules: + - if: '$ON_LASSEN == "OFF"' + when: never + # test the upstream branch + - if: $CI_COMMIT_BRANCH == 'development' + # branches starting 
with "gitlab" + - if: $CI_COMMIT_BRANCH =~ /^gitlab.*/ + - if: $CI_PIPELINE_SOURCE == "push" + when: never + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + - if: $CI_COMMIT_BRANCH == $DEFAULT_BRANCH + - when: on_success + +.job_on_lassen: + extends: .on_lassen + stage: build + needs: [] + variables: + MPIEXEC_EXECUTABLE: jsrun + MPIEXEC_PREFLAGS: "-a 1 -c 1 -g 1" + script: + - bsub -q ${ALLOC_QUEUE} -W ${ALLOC_TIME} -G ${ALLOC_BANK} -J ${ALLOC_NAME} -nnodes 1 -Is ${TEST_SCRIPT} diff --git a/.gitlab/LC/runners/tioga.yml b/.gitlab/LC/runners/tioga.yml new file mode 100644 index 000000000..eecf7f9a3 --- /dev/null +++ b/.gitlab/LC/runners/tioga.yml @@ -0,0 +1,60 @@ +.retry: + retry: + max: 2 + when: + - runner_system_failure + +.on_tioga: + extends: + - .retry + tags: + - tioga + - shell + rules: + - if: '$ON_TIOGA == "OFF"' + when: never + # test the upstream branch + - if: $CI_COMMIT_BRANCH == 'development' + # branches starting with "gitlab" + - if: $CI_COMMIT_BRANCH =~ /^gitlab.*/ + - if: $CI_PIPELINE_SOURCE == "push" + when: never + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + - if: $CI_COMMIT_BRANCH == $DEFAULT_BRANCH + - if: '$CI_JOB_NAME =~ /release_resources_tioga/' + when: always + - when: on_success + +allocate_resources_tioga: + variables: + GIT_STRATEGY: none + extends: + - .on_tioga + stage: allocate + script: + - flux alloc -N 1 -q ${ALLOC_QUEUE} -t=${ALLOC_TIME} --bg --exclusive --job-name=${ALLOC_NAME} + +release_resources_tioga: + variables: + GIT_STRATEGY: none + extends: + - .on_tioga + stage: release + script: + - export JOBID=$(flux jobs -n --name=${ALLOC_NAME} --format="{id}") + - ([[ -n "${JOBID}" ]] && flux cancel ${JOBID}) + when: always + +.job_on_tioga: + extends: .on_tioga + stage: build + needs: ["allocate_resources_tioga"] + variables: + # Note: "flux" gets expanded to "flux run" inside build script + MPIEXEC_EXECUTABLE: flux + MPIEXEC_PREFLAGS: "-c 1 -g 1 -o mpi-spectrum -o cpu-affinity=per-task -o gpu-affinity=per-task -vv" + script: + - echo "JOB NAME ${ALLOC_NAME}" + - export JOBID=$(flux jobs -n --name=${ALLOC_NAME} --format="{id}") + - echo "FLUX ID ${JOBID}" + - flux proxy $( [[ -n "${JOBID}" ]] && echo "${JOBID}" ) flux run -N 1 -n 1 -c 16 -vv ${TEST_SCRIPT} diff --git a/.gitmodules b/.gitmodules index f96430215..ba9b4da82 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,7 +9,11 @@ [submodule "Submodules/RRTMGP"] path = Submodules/RRTMGP url = https://github.com/E3SM-Project/rte-rrtmgp - shallow = true + shallow = true +[submodule "Submodules/NOAH-MP"] + path = Submodules/NOAH-MP + url = https://github.com/AIEADA/noahmp.git + shallow=true [submodule "Submodules/WW3"] path = Submodules/WW3 url = https://github.com/erf-model/WW3 diff --git a/Build/cmake_with_poisson.sh b/Build/cmake_with_poisson.sh index ab455182f..a883fcb7f 100755 --- a/Build/cmake_with_poisson.sh +++ b/Build/cmake_with_poisson.sh @@ -8,7 +8,6 @@ cmake -DCMAKE_INSTALL_PREFIX:PATH=./install \ -DCMAKE_Fortran_COMPILER:STRING=mpifort \ -DMPIEXEC_PREFLAGS:STRING=--oversubscribe \ -DCMAKE_BUILD_TYPE:STRING=Release \ - -DAMREX_LINEAR_SOLVERS:BOOL=ON \ -DERF_DIM:STRING=3 \ -DERF_ENABLE_MPI:BOOL=ON \ -DERF_ENABLE_TESTS:BOOL=ON \ diff --git a/CMake/BuildERFExe.cmake b/CMake/BuildERFExe.cmake index dc9c4208c..6c5d6195d 100644 --- a/CMake/BuildERFExe.cmake +++ b/CMake/BuildERFExe.cmake @@ -18,15 +18,8 @@ function(build_erf_lib erf_lib_name) target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_MOISTURE) if(ERF_ENABLE_MULTIBLOCK) - target_sources(${erf_lib_name} PRIVATE - 
${SRC_DIR}/MultiBlock/ERF_MultiBlockContainer.cpp) target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_MULTIBLOCK) -# if(NOT ERF_MB_EXTERN) - target_sources(${erf_lib_name} PRIVATE - ${SRC_DIR}/MultiBlock/ERF_MultiBlockContainer.cpp) - target_include_directories(${erf_lib_name} PRIVATE $) -# endif() - endif() + endif() if(ERF_ENABLE_WARM_NO_PRECIP) target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_WARM_NO_PRECIP) @@ -34,7 +27,6 @@ function(build_erf_lib erf_lib_name) if(ERF_ENABLE_POISSON_SOLVE) target_sources(${erf_lib_name} PRIVATE - ${SRC_DIR}/TimeIntegration/ERF_slow_rhs_inc.cpp ${SRC_DIR}/Utils/ERF_PoissonSolve.cpp ${SRC_DIR}/Utils/ERF_PoissonSolve_tb.cpp) target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_POISSON_SOLVE) @@ -75,6 +67,16 @@ function(build_erf_lib erf_lib_name) target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_NETCDF) endif() + if(ERF_ENABLE_NOAH) + target_include_directories(${erf_lib_name} PUBLIC + $ + $) + target_sources(${erf_lib_name} PRIVATE + ${SRC_DIR}/LandSurfaceModel/NOAH/ERF_NOAH.cpp) + target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_NOAH) + target_link_libraries_system(${erf_lib_name} PUBLIC NoahMP::noahmp) + endif() + if(ERF_ENABLE_RRTMGP) target_sources(${erf_lib_name} PRIVATE ${SRC_DIR}/Utils/ERF_Orbit.cpp @@ -91,10 +93,10 @@ function(build_erf_lib erf_lib_name) ${CMAKE_SOURCE_DIR}/Submodules/RRTMGP/cpp/extensions/fluxes_byband/mo_fluxes_byband_kernels.cpp ) - # The interface code needs to know about the RRTMGP includes + # The interface code needs to know about the RRTMGP includes target_compile_definitions(${erf_lib_name} PUBLIC ERF_USE_RRTMGP) - target_include_directories(${erf_lib_name} SYSTEM PUBLIC + target_include_directories(${erf_lib_name} SYSTEM PUBLIC ${CMAKE_SOURCE_DIR}/Submodules/RRTMGP/cpp/extensions/fluxes_byband ${CMAKE_SOURCE_DIR}/Submodules/RRTMGP/cpp/extensions/cloud_optics ${CMAKE_SOURCE_DIR}/Submodules/RRTMGP/cpp/examples @@ -126,6 +128,9 @@ function(build_erf_lib erf_lib_name) ${SRC_DIR}/BoundaryConditions/ERF_BoundaryConditions_bndryreg.cpp ${SRC_DIR}/BoundaryConditions/ERF_BoundaryConditions_realbdy.cpp ${SRC_DIR}/BoundaryConditions/ERF_FillPatch.cpp + ${SRC_DIR}/BoundaryConditions/ERF_FillCoarsePatch.cpp + ${SRC_DIR}/BoundaryConditions/ERF_FillIntermediatePatch.cpp + ${SRC_DIR}/BoundaryConditions/ERF_FillBdyCCVels.cpp ${SRC_DIR}/BoundaryConditions/ERF_FillPatcher.cpp ${SRC_DIR}/BoundaryConditions/ERF_PhysBCFunct.cpp ${SRC_DIR}/Diffusion/ERF_DiffusionSrcForMom_N.cpp @@ -161,7 +166,7 @@ function(build_erf_lib erf_lib_name) ${SRC_DIR}/PBL/ERF_ComputeDiffusivityMYNN25.cpp ${SRC_DIR}/PBL/ERF_ComputeDiffusivityYSU.cpp ${SRC_DIR}/SourceTerms/ERF_ApplySpongeZoneBCs.cpp - ${SRC_DIR}/SourceTerms/ERF_ApplySpongeZoneBCs_ReadFromFile.cpp + ${SRC_DIR}/SourceTerms/ERF_ApplySpongeZoneBCs_ReadFromFile.cpp ${SRC_DIR}/SourceTerms/ERF_make_buoyancy.cpp ${SRC_DIR}/SourceTerms/ERF_add_thin_body_sources.cpp ${SRC_DIR}/SourceTerms/ERF_make_mom_sources.cpp @@ -205,13 +210,6 @@ function(build_erf_lib erf_lib_name) ${SRC_DIR}/LandSurfaceModel/MM5/ERF_MM5.cpp ) - if(NOT "${erf_exe_name}" STREQUAL "erf_unit_tests") - target_sources(${erf_lib_name} - PRIVATE - ${SRC_DIR}/main.cpp - ) - endif() - include(AMReXBuildInfo) generate_buildinfo(${erf_lib_name} ${CMAKE_SOURCE_DIR}) if (${ERF_USE_INTERNAL_AMREX}) @@ -250,7 +248,7 @@ endif() target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) - 
target_include_directories(${erf_lib_name} PUBLIC $) + target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) @@ -261,7 +259,7 @@ endif() target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) target_include_directories(${erf_lib_name} PUBLIC $) - + if(ERF_ENABLE_RRTMGP) target_link_libraries(${erf_lib_name} PUBLIC yakl) target_link_libraries(${erf_lib_name} PUBLIC rrtmgp) @@ -296,6 +294,13 @@ function(build_erf_exe erf_exe_name) set(SRC_DIR ${CMAKE_SOURCE_DIR}/Source) + if(NOT "${erf_exe_name}" STREQUAL "erf_unit_tests") + target_sources(${erf_exe_name} + PRIVATE + ${SRC_DIR}/main.cpp + ) + endif() + target_link_libraries(${erf_exe_name} PUBLIC ${erf_lib_name}) include(${CMAKE_SOURCE_DIR}/CMake/SetERFCompileFlags.cmake) set_erf_compile_flags(${erf_exe_name}) diff --git a/CMake/FindNetCDF.cmake b/CMake/FindNetCDF.cmake index 3e54ea3e8..6fac102b3 100644 --- a/CMake/FindNetCDF.cmake +++ b/CMake/FindNetCDF.cmake @@ -23,10 +23,44 @@ endif (NETCDF_INCLUDES AND NETCDF_LIBRARIES) find_package(PkgConfig REQUIRED QUIET) pkg_check_modules(NETCDF REQUIRED IMPORTED_TARGET netcdf) +find_path(NETCDF_INCLUDES netcdf.h + HINTS NETCDF_DIR/include ENV NETCDF_DIR) + +find_library(NETCDF_LIBRARIES_C NAMES netcdf HINTS NETCDF_DIR/lib ENV NETCDF_DIR) +mark_as_advanced(NETCDF_LIBRARIES_C) + +set(NetCDF_has_interfaces "YES") # will be set to NO if we're missing any interfaces +set(NetCDF_libs "${NETCDF_LIBRARIES_C}") + +get_filename_component(NetCDF_lib_dirs "${NETCDF_LIBRARIES_C}" PATH) + +macro(NetCDF_check_interface lang header libs) + if(NETCDF_${lang}) + find_path(NETCDF_INCLUDES_${lang} NAMES ${header} + HINTS "${NETCDF_INCLUDES}" NO_DEFAULT_PATH) + find_library(NETCDF_LIBRARIES_${lang} NAMES ${libs} + HINTS "${NetCDF_lib_dirs}" NO_DEFAULT_PATH) + mark_as_advanced(NETCDF_INCLUDES_${lang} NETCDF_LIBRARIES_${lang}) + + if(NETCDF_INCLUDES_${lang} AND NETCDF_LIBRARIES_${lang}) + list(INSERT NetCDF_libs 0 ${NETCDF_LIBRARIES_${lang}}) # prepend so that -lnetcdf is last + else(NETCDF_INCLUDES_${lang} AND NETCDF_LIBRARIES_${lang}) + set(NetCDF_has_interfaces "NO") + message(STATUS "Failed to find NetCDF interface for ${lang}") + endif(NETCDF_INCLUDES_${lang} AND NETCDF_LIBRARIES_${lang}) + endif(NETCDF_${lang}) +endmacro(NetCDF_check_interface) + +NetCDF_check_interface(CXX netcdfcpp.h netcdf_c++) +NetCDF_check_interface(F77 netcdf.inc netcdff) +NetCDF_check_interface(F90 netcdf.mod netcdff) + +set(NETCDF_LIBRARIES "${NetCDF_libs}" CACHE STRING "All NetCDF libraries required for interface level") + # handle the QUIETLY and REQUIRED arguments and set NETCDF_FOUND to TRUE if # all listed variables are TRUE include (FindPackageHandleStandardArgs) -find_package_handle_standard_args (NetCDF DEFAULT_MSG NETCDF_LIBRARIES NETCDF_LINK_LIBRARIES NETCDF_INCLUDE_DIRS) +find_package_handle_standard_args (NetCDF DEFAULT_MSG NETCDF_LIBRARIES NETCDF_LINK_LIBRARIES NETCDF_INCLUDE_DIRS NETCDF_INCLUDES NetCDF_has_interfaces) mark_as_advanced (NETCDF_LIBRARIES NETCDF_INCLUDES) diff --git a/CMake/SetAmrexOptions.cmake b/CMake/SetAmrexOptions.cmake index c902bef72..9c43e753a 100644 --- a/CMake/SetAmrexOptions.cmake +++ b/CMake/SetAmrexOptions.cmake @@ -30,6 +30,8 @@ set(AMReX_FORTRAN OFF) set(AMReX_LINEAR_SOLVERS OFF) if(ERF_ENABLE_POISSON_SOLVE) set(AMReX_LINEAR_SOLVERS ON) + set(AMReX_LINEAR_SOLVERS_EM OFF) + 
set(AMReX_LINEAR_SOLVERS_INCFLO OFF) endif() set(AMReX_PARTICLES OFF) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e59c3c5b..e045d2eb6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,9 @@ option(ERF_ENABLE_CUDA "Enable CUDA" OFF) option(ERF_ENABLE_HIP "Enable HIP" OFF) option(ERF_ENABLE_SYCL "Enable SYCL" OFF) +#Options for NOAH-MP +option(ERF_ENABLE_NOAH "Enable Noah-MP" OFF) + #Options for C++ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_EXTENSIONS OFF) @@ -78,8 +81,13 @@ if (${ERF_USE_INTERNAL_AMREX}) ########################### AMReX ##################################### add_subdirectory(${AMREX_SUBMOD_LOCATION}) - set(FCOMPARE_EXE ${CMAKE_BINARY_DIR}/Submodules/AMReX/Tools/Plotfile/amrex_fcompare - CACHE INTERNAL "Path to fcompare executable for regression tests") + if(WIN32) + set(FCOMPARE_EXE ${CMAKE_BINARY_DIR}/Submodules/AMReX/Tools/Plotfile/*/amrex_fcompare.exe + CACHE STRING "Path to fcompare executable for regression tests") + else() + set(FCOMPARE_EXE ${CMAKE_BINARY_DIR}/Submodules/AMReX/Tools/Plotfile/amrex_fcompare + CACHE STRING "Path to fcompare executable for regression tests") + endif() else() set(CMAKE_PREFIX_PATH ${AMREX_DIR} ${CMAKE_PREFIX_PATH}) list(APPEND AMREX_COMPONENTS @@ -109,8 +117,13 @@ else() find_package(AMReX CONFIG REQUIRED COMPONENTS ${AMREX_COMPONENTS}) message(STATUS "Found AMReX = ${AMReX_DIR}") - set(FCOMPARE_EXE ${AMReX_DIR}/../../../bin/amrex_fcompare - CACHE INTERNAL "Path to fcompare executable for regression tests") + if(WIN32) + set(FCOMPARE_EXE ${AMReX_DIR}/../../../*/amrex_fcompare.exe + CACHE STRING "Path to fcompare executable for regression tests") + else() + set(FCOMPARE_EXE ${AMReX_DIR}/../../../bin/amrex_fcompare + CACHE STRING "Path to fcompare executable for regression tests") + endif() endif() ########################## NETCDF ################################## @@ -124,6 +137,18 @@ if(ERF_ENABLE_NETCDF) endif() endif() +########################## NOAH-MP ################################## + +if(ERF_ENABLE_NOAH) + if(ERF_ENABLE_NETCDF) + set(NOAHMP_HOME ${CMAKE_SOURCE_DIR}/Submodules/NOAH-MP) + set(NOAHMP_BIN ${CMAKE_BINARY_DIR}/Submodules/NOAH-MP) + add_subdirectory(${NOAHMP_HOME} ${NOAHMP_BIN}) + else() + message(FATAL_ERROR "Noah-MP requires NetCDF be enabled") + endif() +endif() + ########################### RRTMGP ################################# if(ERF_ENABLE_RRTMGP) diff --git a/Docs/sphinx_doc/Inputs.rst b/Docs/sphinx_doc/Inputs.rst index ec0775520..2f78a91fe 100644 --- a/Docs/sphinx_doc/Inputs.rst +++ b/Docs/sphinx_doc/Inputs.rst @@ -428,9 +428,8 @@ List of Parameters | | cfl or other | | | | | settings | | | +----------------------------+----------------------+----------------+-------------------+ -| **erf.fixed_fast_dt** | set fast dt | Real > 0 | only relevant | -| | as this value | | if use_native_mri | -| | | | is true | +| **erf.fixed_fast_dt** | set fast dt | Real > 0 | | +| | as this value | | | +----------------------------+----------------------+----------------+-------------------+ | **erf.fixed_mri_dt_ratio** | set fast dt | even int > 0 | only relevant | | | as slow dt / | | if no_substepping | @@ -719,6 +718,74 @@ The requested output files have the following columns: #. SGS turbulence dissipation, :math:`\epsilon` (m2/s3) +Data Sampling Outputs +===================== + +Data along query lines or planes may be output during the simulation if +``erf.do_line_sampling = true`` or ``erf.do_plane_sampling = true``, respectively.
+The potential temperature and wind speed will be written to native ``plt_line``/``plt_plane`` +plotfiles at the step frequency dictated by ``erf.sampler_interval``. For line sampling, +users must prescribe the ``sample_line_lo`` and ``sample_line_hi`` inputs, which are 3 integer +values corresponding to the (i,j,k) indices at the beginning and end of the line. +Additionally, users must specify ``sample_line_dir`` to prescribe the direction of +the line. The same inputs are used for the plane sampling except that ``sample_plane_lo/hi`` +must be the physical locations of the plane corners. This output functionality has +not been implemented for terrain. + +.. _list-of-parameters-10b: + + +List of Parameters +------------------ + ++-------------------------------+------------------+----------------+----------------+ +| Parameter | Definition | Acceptable | Default | +| | | Values | | ++===============================+==================+================+================+ +| **erf.sampler_interval** | Output | Integer | -1 | +| | frequency | | | ++-------------------------------+------------------+----------------+----------------+ +| **erf.do_line_sampling** | Flag to do line | Boolean | false | +| | sampling | | | +| | | | | ++-------------------------------+------------------+----------------+----------------+ +| **erf.do_plane_sampling** | Flag to do plane | Boolean | false | +| | sampling | | | +| | | | | ++-------------------------------+------------------+----------------+----------------+ +| **erf.sample_line_dir** | Directionality | Integer | None | +| | of the line | | | ++-------------------------------+------------------+----------------+----------------+ +| **erf.sample_plane_dir** | Directionality | Integer | None | +| | of the plane | | | ++-------------------------------+------------------+----------------+----------------+ +| **erf.sample_line_lo/hi** | Bounding (i,j,k) | 3 Integers per | None | +| | on the line(s) | line | | ++-------------------------------+------------------+----------------+----------------+ +| **erf.sample_plane_lo/hi** | Bounding point | 3 Reals per | None | +| | on the plane(s) | plane | | ++-------------------------------+------------------+----------------+----------------+ + +.. _examples-of-usage-10b: + +Example of Usage +----------------- + +:: + + erf.sampler_interval = 1 # Write plt files every step + + erf.do_line_sampling = true # Do line sampling + erf.sample_line_lo = 5 32 5 10 32 5 # Lo points for two lines + erf.sample_line_hi = 5 32 25 1000 32 5 # Hi points for two lines + erf.sample_line_dir = 2 0 # One line in z and one in x + + erf.do_plane_sampling = true # Do plane sampling + erf.sample_plane_lo = 48.0 48.0 32.0 # Lo points for one plane + erf.sample_plane_hi = 320.0 320.0 32.0 # Hi points for one plane + erf.sample_plane_dir = 2 # One plane with z normal + + Advection Schemes ================= diff --git a/Docs/sphinx_doc/TimeAdvance.rst b/Docs/sphinx_doc/TimeAdvance.rst index e6d12ef33..872c41409 100644 --- a/Docs/sphinx_doc/TimeAdvance.rst +++ b/Docs/sphinx_doc/TimeAdvance.rst @@ -135,8 +135,14 @@ Then the acoustic substepping evolves the equations in the form - \frac{\partial (\beta_1 W^{\prime \prime, \tau} + \beta_2 W^{\prime \prime, \tau + \delta \tau})}{\partial z} + R^t_{\rho} \right) -where :math:`\beta_1 = 0.5 (1 - \beta_s)` and :math:`\beta_2 = 0.5 (1 + \beta_s)` with :math:`\beta_s = 0.1`. -:math:`\beta_s` is the acoustic step off-centering coefficient and 0.1 is the typical WRF value.
This off-centering is intended to provide damping of both horizontally and vertically propagating sound waves by biasing the time average toward the future time step. +where :math:`\beta_1 = 0.5 (1 - \beta_s)` and :math:`\beta_2 = 0.5 (1 + \beta_s)`. + +:math:`\beta_s` is the acoustic step off-centering coefficient. When we do implicit substepping, we use +the typical WRF value of 0.1. This off-centering is intended to provide damping of both horizontally +and vertically propagating sound waves by biasing the time average toward the future time step. + +When we do fully explicit substepping, we set :math:`\beta_s = -1.0`, which sets +:math:`\beta_1 = 1` and :math:`\beta_2 = 0`. To solve the coupled system, we first evolve the equations for :math:`U^{\prime \prime, \tau + \delta \tau}` and :math:`V^{\prime \prime, \tau + \delta \tau}` explicitly using :math:`\Theta^{\prime \prime, \tau}` which is already known. @@ -149,10 +155,10 @@ to control horizontally propagating sound waves. .. math:: p^{\prime\prime,\tau*} = p^{\prime\prime,\tau} - + \beta_d \left( p^{\prime\prime,\tau} + p^{\prime\prime,\tau-\delta\tau} \right) + + \beta_d \left( p^{\prime\prime,\tau} - p^{\prime\prime,\tau-\delta\tau} \right) where :math:`\tau*` is the forward projected value used in RHS of the acoustic substepping equations for horizontal momentum. According to Skamarock et al, -This is equivalent to including a horizontal diffusion term in the continuity +this is equivalent to including a horizontal diffusion term in the continuity equation. A typical damping coefficient of :math:`\beta_d = 0.1` is used, as in WRF. diff --git a/Docs/sphinx_doc/building.rst b/Docs/sphinx_doc/building.rst index eb50a4ba2..1777e9419 100644 --- a/Docs/sphinx_doc/building.rst +++ b/Docs/sphinx_doc/building.rst @@ -334,6 +334,99 @@ Finally, you can prepare your SLURM job script, using the following as a guide: To submit your job script, do ``sbatch [your job script]`` and you can check its status by doing ``squeue -u [your username]``. +AMReX--Kokkos on `Perlmutter`_ (NERSC) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is an `example in amrex-devtests`_ of how one can build `Kokkos`_ and `AMReX`_. This uses ``cmake`` to first compile +Kokkos and AMReX, and then the example is built. This section describes the build procedure on `Perlmutter`_, though this can be used as +a template for other machines as well. + +Load the following modules, and specify ``MPI_INCLUDE_PATH`` in ``~/.bash_profile``. The ``cmake`` version has to be ``3.24.3``. + +.. code-block:: bash + + module load cray-mpich + module load PrgEnv-gnu + module load cudatoolkit/12.2 + module load cmake/3.24.3 + + export MPI_INCLUDE_PATH=/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include + +Make sure to do ``source ~/.bash_profile``. + +**Kokkos installation on Perlmutter** + +To install `Kokkos`_, execute the following commands. In the ``cmake`` command, specify the path where the Kokkos installation should reside +``-DCMAKE_INSTALL_PREFIX=``. The full path to the ``kokkos`` directory has to be specified in +``-DCMAKE_CXX_COMPILER=/bin/nvcc_wrapper``. + +.. code-block:: bash + + git clone https://github.com/kokkos/kokkos.git + cd kokkos + mkdir build + cd build + cmake .. -DCMAKE_INSTALL_PREFIX= -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=17 -DKokkos_ENABLE_CUDA=ON -DCMAKE_CXX_COMPILER=/bin/nvcc_wrapper -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON -DKokkos_ARCH_PASCAL60=ON + make -j8 + make install + +**AMReX installation on Perlmutter** + +.. 
note:: + + After cloning the repository in the first step below, add the following lines to ``amrex/CMakeLists.txt`` for MPI installation. + + .. code-block:: bash + + # Find MPI + find_package(MPI REQUIRED) + + # Include MPI headers + include_directories(${MPI_INCLUDE_PATH}) + +To install `AMReX`_, execute the following commands. In the ``cmake`` command, specify the path where the AMReX installation should reside +``-DCMAKE_INSTALL_PREFIX=``. + +.. code-block:: bash + + git clone https://github.com/AMReX-Codes/amrex.git + cd amrex + mkdir build + cd build + cmake .. -DCMAKE_INSTALL_PREFIX= -DAMReX_GPU_BACKEND=CUDA -DAMReX_CUDA_ARCH=60 -DCMAKE_PREFIX_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/23.9/math_libs/12.2/lib64 -DAMReX_MPI=ON -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DMPI_INCLUDE_PATH=/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include + make -j8 + make install + +**Compiling the AMReX-Kokkos example on Perlmutter** + +.. note:: + + After cloning the repository in the first step below, add the following lines to ``amrex-devtests/kokkos/CMakeLists.txt`` for MPI installation. + + .. code-block:: bash + + # Find MPI + find_package(MPI REQUIRED) + + # Include MPI headers + include_directories(${MPI_INCLUDE_PATH}) + +To compile the AMReX-Kokkos example, execute the following commands. In the ``cmake`` command, specify the full path to the amrex installation +directory ``-DAMReX_ROOT=``, and the Kokkos installation directory ``-DKokkos_ROOT=``. + +.. code-block:: bash + + git clone https://github.com/WeiqunZhang/amrex-devtests.git + cd amrex-devtests/kokkos + mkdir build + cd build + cmake .. -DENABLE_CUDA=ON -DAMReX_ROOT= -DKokkos_ROOT= -DCMAKE_CUDA_ARCHITECTURES=60 -DMPI_INCLUDE_PATH=/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include + make -j8 + +.. _`example in amrex-devtests`: https://github.com/WeiqunZhang/amrex-devtests/tree/main/kokkos +.. _`Kokkos`: https://github.com/kokkos/kokkos +.. _`AMReX`: https://github.com/AMReX-Codes/amrex +.. _`Perlmutter`: https://docs.nersc.gov/systems/perlmutter/architecture/ Kestrel (NREL) ~~~~~~~~~~~~~~ diff --git a/Docs/sphinx_doc/figures/GAD_Schematic.png b/Docs/sphinx_doc/figures/GAD_Schematic.png new file mode 100644 index 000000000..99c662b36 Binary files /dev/null and b/Docs/sphinx_doc/figures/GAD_Schematic.png differ diff --git a/Docs/sphinx_doc/theory/Buoyancy.rst b/Docs/sphinx_doc/theory/Buoyancy.rst index dbf289ac2..ee929e644 100644 --- a/Docs/sphinx_doc/theory/Buoyancy.rst +++ b/Docs/sphinx_doc/theory/Buoyancy.rst @@ -120,7 +120,7 @@ This expression for buoyancy is from `khairoutdinov2003cloud`_ and `bryan2002ben .. math:: \begin{equation} - \mathbf{B} = \rho'\mathbf{g} \approx -\rho\Bigg(\frac{T'}{T} + 0.61 q_v' - q_c - q_p - \frac{p'}{p}\Bigg), + \mathbf{B} = \rho'\mathbf{g} \approx -\rho\Bigg(\frac{T'}{T} + 0.61 q_v' - q_c - q_p - \frac{p'}{p}\Bigg)\mathbf{g}, \end{equation} The derivation follows. The total density is given by :math:`\rho = \rho_d(1 + q_v + q_c + q_p)`, which can be written as diff --git a/Docs/sphinx_doc/theory/WindFarmModels.rst b/Docs/sphinx_doc/theory/WindFarmModels.rst index ee041bed2..d4ab0a183 100644 --- a/Docs/sphinx_doc/theory/WindFarmModels.rst +++ b/Docs/sphinx_doc/theory/WindFarmModels.rst @@ -4,7 +4,7 @@ Wind farm models Introduction ------------- -ERF supports models for wind farm parametrization in which the effects of wind turbines are represented by imposing a momentum sink on the mean flow and/or turbulent kinetic energy (TKE). Currently the Fitch model (`Fitch et al.
2012`_), Explicit Wake Parametrization (EWP) model (`Volker et al. 2015`_) and Simplified actuator disk model (See Section 3.2 in `Wind Energy Handbook 2nd edition`_) are supported. +ERF supports models for wind farm parametrization in which the effects of wind turbines are represented by imposing a momentum sink on the mean flow and/or turbulent kinetic energy (TKE). Currently the Fitch model (`Fitch et al. 2012`_), Explicit Wake Parametrization (EWP) model (`Volker et al. 2015`_), Simplified actuator disk model (see Section 3.2 in `Wind Energy Handbook 2nd edition`_), and Generalized actuator disk model (`Mirocha et al. 2014`_, see Chapter 3 of `Small Wind Turbines`_) are supported. .. _Fitch model: @@ -126,8 +126,8 @@ The EWP model does not have a concept of intersected area by the turbine rotor l .. _actuator_disk_model_simplified: -Actuator Disk Model - Simplified -================================= +Simplified actuator disk model +----------------------------------- A simplified actuator disk model based on one-dimensional momentum theory is implemented (See Section 3.2 in `Wind Energy Handbook 2nd edition`_). A schematic of the actuator disk is shown in Fig. :numref:`fig:ActuatorDisk_Schematic`. The model is implemented as source terms in the equations for the horizontal velocity components (ie. `x` and `y` directions). The thrust force from the one-dimensional momentum theory is given by .. math:: :label: actuator_disk_thrust F = 2 \rho \pi R^2 (\mathbf{U}_\infty \cdot \mathbf{n})^2 a (1-a) \\ = \int_0^{2\pi}\int_0^R 2 \rho (\mathbf{U}_\infty \cdot \mathbf{n})^2 a (1 - a) r\,dr\,d\theta, -where :math:`\rho` is the density of incoming air, :math:`\mathbf{U}_\infty` is the velocity vector of incoming air at some distance (say :math:`d=2.5` times the turbine diameter) upstream of the turbine (see Fig. :numref:`fig:ActuatorDisk_Sampling`), :math:`\mathbf{n}` is the surface normal vector of the actuator disk, and :math:`a = 1 - \cfrac{C_P}{C_T}`, is the axial induction factor for the turbine, and :math:`R` is the radius of the wind turbine swept area. The integration is performed over the swept area of the disk. Hence, the force on an elemental annular disc of thickness :math:`dr` is +where :math:`\rho` is the density of incoming air, :math:`\mathbf{U}_\infty` is the velocity vector of incoming air at some distance (say :math:`d=2.5` times the turbine diameter) upstream of the turbine (see Fig. :numref:`fig:ActuatorDisk_Sampling`), :math:`\mathbf{n}` is the surface normal vector of the actuator disk, :math:`a = 1 - \cfrac{C_P}{C_T}` is the axial induction factor for the turbine, and :math:`R` is the radius of the wind turbine swept area. The integration is performed over the swept area of the disk. Hence, the force on an elemental annular disk of thickness :math:`dr` is .. math:: @@ -181,6 +181,142 @@ where :math:`dA` is the area of the actuator disk in the mesh cell (see Fig. :nu .. _Inputs: + + +.. _generalized_actuator_disk_model: + +Generalized actuator disk model +------------------------------------ + +The generalized actuator disk (GAD) model, based on blade element theory (`Mirocha et al. 2014`_, see Chapter 3 of `Small Wind Turbines`_), is implemented. Similar to the simplified actuator disk model, GAD also models the wind turbine as a disk, but takes into account the details of the blade geometry (see Fig. :numref:`fig:GAD_Schematic`).
The forces on the blades in the x, y, and z directions are computed and contribute to the source terms for the fluid momentum equations. The source terms in a mesh cell inside the actuator disk are given as: + +.. math:: + :label: GAD_source_terms + + \frac{\partial u}{\partial t} &= -\frac{F_x}{\rho \Delta x\Delta y\Delta z} \\ + \frac{\partial v}{\partial t} &= -\frac{F_y}{\rho \Delta x\Delta y\Delta z} \\ + \frac{\partial w}{\partial t} &= -\frac{F_z}{\rho \Delta x\Delta y\Delta z}, + +where :math:`\rho` is the density of air in the cell, and :math:`\Delta x, \Delta y, \Delta z` are the mesh spacings in the x, y, and z directions. The forces on the GAD are given by: + +.. math:: + :label: GAD_forces + + F_x &= F_n \cos{\Phi} + F_t \sin\zeta \sin\Phi \\ + F_y &= F_n \sin{\Phi} - F_t \sin\zeta \cos\Phi \\ + F_z &= -F_t \cos\zeta, + +where :math:`F_n` and :math:`F_t` are the normal and tangential forces, and the angles are as shown in Figure :numref:`fig:GAD_Schematic`. The normal and tangential forces are: + +.. math:: + :label: GAD_Fn_Ft + + \begin{bmatrix} + F_n \\ + F_t + \end{bmatrix} + = + \begin{bmatrix} + \cos\psi & \sin\psi \\ + \sin\psi & -\cos\psi + \end{bmatrix} + \begin{bmatrix} + L \\ + D + \end{bmatrix}, + +where + +.. math:: + + \psi = \tan^{-1}\left(\frac{V_n}{V_t}\right), + +and + +.. math:: + :label: GAD_Vn_Vt + + V_n &= V_0(1-a_n) \\ + V_t &= \Omega(1+a_t)r, + +where :math:`V_0` is the freestream velocity at a user-specified distance upstream from the disk plane as described in Section :ref:`actuator_disk_model_simplified` (also see Fig. :numref:`fig:ActuatorDisk_Sampling`), :math:`\Omega` is the rotational speed of the turbine, :math:`r` is the radial location along the blade span, and :math:`a_n` and :math:`a_t` are the normal and tangential induction factors. The lift and drag forces are given by: + +.. math:: + :label: GAD_L_D + + L &= \frac{1}{2} \rho V_r^2 c C_l \\ + D &= \frac{1}{2} \rho V_r^2 c C_d, + +where :math:`\rho` is the density of air, :math:`c` is the chord length of the airfoil cross-section, :math:`C_l` and :math:`C_d` are the sectional lift and drag coefficients on the airfoil cross-section (which are functions of the incoming angle :math:`\psi`, blade twist :math:`\xi`, and blade pitch :math:`\phi`. See Fig. :numref:`fig:GAD_Schematic`), and the relative wind velocity is :math:`V_r = \sqrt{V_n^2 + V_t^2}`. The normal and tangential sectional coefficients are computed as: + +.. math:: + :label: GAD_Cn_Ct + + \begin{bmatrix} + C_n \\ + C_t + \end{bmatrix} + = + \begin{bmatrix} + \cos\psi & \sin\psi \\ + \sin\psi & -\cos\psi + \end{bmatrix} + \begin{bmatrix} + C_l \\ + C_d + \end{bmatrix}, + +and the normal and tangential induction factors are given by: + +.. math:: + :label: GAD_an_at + + a_n &= \left[1 + \frac{4F \sin^2\psi}{s C_n}\right]^{-1} \\ + a_t &= \left[\frac{4F \sin\psi \cos\psi}{s C_t} - 1\right]^{-1}, + +where + +.. math:: + + F = F_\text{tip} + F_\text{hub} = \frac{2}{\pi} \left[\cos^{-1}\left(\exp(-f_\text{tip})\right) + \cos^{-1}\left(\exp(-f_\text{hub})\right)\right], + +and + +.. math:: + + f_\text{tip} &= B \frac{(r_\text{tip}-r)}{2r \sin\psi} \\ + f_\text{hub} &= B \frac{(r-r_\text{hub})}{2r \sin\psi}, + +where :math:`r_\text{hub}` and :math:`r_\text{tip}` are the radii of the hub and the blade tip from the center of rotation of the disk, :math:`r` is the radial location along the blade span, and the solidity factor is :math:`s=\frac{cB}{2\pi r}`, where :math:`B` is the number of blades.
+ +An iterative procedure is needed to compute the source terms; it proceeds as follows: + +1. An initial value is assumed for the normal and tangential induction factors :math:`a_n` and :math:`a_t`. +2. Compute the normal and tangential velocities from Eqn. :eq:`GAD_Vn_Vt`. +3. From the tables for the `turbine specifications`_, `details of the blade geometry`_, and the `sectional coefficients of the airfoil cross sections`_, compute the values of :math:`C_l` and :math:`C_d` corresponding to the radial location :math:`r` along the blade span and the angle of attack :math:`\alpha = \psi - \xi + \phi`. +4. Compute the normal and tangential sectional coefficients :math:`C_n` and :math:`C_t` from Eqn. :eq:`GAD_Cn_Ct`. +5. Compute the normal and tangential induction factors :math:`a_n` and :math:`a_t` using Eqn. :eq:`GAD_an_at`. +6. Repeat steps 2 to 5 until the errors in the normal and tangential induction factors, :math:`a_n` and :math:`a_t`, are less than :math:`1 \times 10^{-5}`. +7. Once the iterations converge, compute the sectional lift and drag forces, :math:`L` and :math:`D`, using Eqn. :eq:`GAD_L_D`. +8. Compute the normal and tangential forces, :math:`F_n` and :math:`F_t`, using Eqn. :eq:`GAD_Fn_Ft`. +9. Compute the forces on the disk using Eqn. :eq:`GAD_forces`. +10. Compute the source terms in the momentum equation using Eqn. :eq:`GAD_source_terms`. + +A minimal illustrative sketch of this iteration is given at the end of this section. + + +.. _fig:GAD_Schematic: + +.. figure:: ../figures/GAD_Schematic.png + :width: 600 + :align: center + + Different views of the GAD showing the forces and angles involved: Blade cross section showing the normal (:math:`V_n`) and tangential (:math:`V_t`) components of velocity with the normal (:math:`a_n`) and tangential (:math:`a_t`) induction factors, relative wind velocity :math:`V_r`, blade twist angle :math:`\xi`, angle of relative wind :math:`\psi`, blade pitch angle :math:`\phi`, lift (:math:`L`) and drag (:math:`D`) forces, and normal (:math:`F_n`) and tangential (:math:`F_t`) forces; top view showing the flow direction and inclination angle :math:`\Phi`; and front view showing the actuator disk rotating clockwise. + +.. _`Mirocha et al. 2014`: https://opensky.ucar.edu/islandora/object/articles:13295 +.. _`Small Wind Turbines`: https://doi.org/10.1007/978-1-84996-175-2 +.. _`turbine specifications`: https://github.com/NREL/openfast-turbine-models/blob/main/IEA-scaled/NREL-2.8-127/NREL-2.82-127_performance.csv +.. _`details of the blade geometry`: https://github.com/NREL/openfast-turbine-models/blob/main/IEA-scaled/NREL-2.8-127/20_monolithic_opt2/OpenFAST/NREL-2p8-127_AeroDyn15_blade.dat +.. _`sectional coefficients of the airfoil cross sections`: https://github.com/NREL/openfast-turbine-models/tree/main/IEA-scaled/NREL-2.8-127/20_monolithic_opt2/OpenFAST/Airfoils + Inputs for wind farm parametrization models ------------------------------------------------------------ @@ -209,7 +345,7 @@ The following are the inputs required for wind farm simulations. // have the same specifications erf.windfarm_spec_table = "wind-turbine_1WT.tbl" - // For simplified actuator disk model the following parameters are needed + // In addition to the above, for the simplified actuator disk model the following parameters are needed // The distance of the freestream velocity sampling disk from the turbine actuator // disk erf.sampling_distance_by_D = 2.5
// The angle of the turbine actuator disk from the x axis erf.turb_disk_angle_from_x = 135.0 + // In addition to the above, for the generalized actuator disk model the following parameters are needed + + // Table containing additional specification information of the wind turbine. + // See Note below + erf.windfarm_spec_table_extra = "NREL-2.82-127_performance.csv" + + // Table containing details of blade geometry + // See Note below + erf.windfarm_blade_table = "NREL-2p8-127_AeroDyn15_blade.dat" + + // Tables containing the sectional lift and drag coefficients for the + // blade airfoil cross-sections. + // See Note below. + erf.windfarm_airfoil_tables = "Airfoils" + +.. note:: + + The formats for the files required for the generalized actuator disk model are + + 1. erf.windfarm_spec_table_extra = `NREL-2.82-127_performance.csv`_ + 2. erf.windfarm_blade_table = `NREL-2p8-127_AeroDyn15_blade.dat`_ + 3. erf.windfarm_airfoil_tables = `Airfoils`_. + +.. _`NREL-2.82-127_performance.csv`: https://github.com/NREL/openfast-turbine-models/blob/main/IEA-scaled/NREL-2.8-127/NREL-2.82-127_performance.csv +.. _`NREL-2p8-127_AeroDyn15_blade.dat`: https://github.com/NREL/openfast-turbine-models/blob/main/IEA-scaled/NREL-2.8-127/20_monolithic_opt2/OpenFAST/NREL-2p8-127_AeroDyn15_blade.dat +.. _`Airfoils`: https://github.com/NREL/openfast-turbine-models/tree/main/IEA-scaled/NREL-2.8-127/20_monolithic_opt2/OpenFAST/Airfoils + 1. ``erf.windfarm_type`` has to be one of the supported models - ``Fitch``, ``EWP``, ``SimpleActuatorDisk``. 2. ``erf.windfarm_loc_type`` is a variable to specify how the wind turbine locations in the wind farm is specified. If using the latitude and longitude of the turbine location, this has to be ``lat_lon`` or if using x and y coordinates to specify the turbine locations, this input is ``x_y``.
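As a worked illustration of the iterative procedure above, the following is a minimal sketch of the blade-element loop at one radial station. It is written in Python for readability rather than in the C++ of the ERF source, and the station values and the simple linear lift/drag polar are hypothetical placeholders standing in for the performance, blade, and airfoil tables referenced in the Note above; only the update formulas follow Eqns. :eq:`GAD_Vn_Vt`, :eq:`GAD_Cn_Ct`, and :eq:`GAD_an_at`.

.. code-block:: python

    import math

    def gad_bem_station(V0, Omega, r, r_hub, r_tip, chord, twist, pitch, B,
                        polar, tol=1.0e-5, max_iter=200):
        """Iterate steps 1-6 above for one radial station; returns (a_n, a_t, psi)."""
        a_n, a_t = 0.0, 0.0                        # step 1: initial guesses
        for _ in range(max_iter):
            V_n = V0 * (1.0 - a_n)                 # step 2: Eqn GAD_Vn_Vt
            V_t = Omega * (1.0 + a_t) * r
            psi = math.atan2(V_n, V_t)             # angle of relative wind
            C_l, C_d = polar(psi - twist + pitch)  # step 3: lookup at angle of attack
            # step 4: rotate (C_l, C_d) into (C_n, C_t), Eqn GAD_Cn_Ct
            C_n = C_l * math.cos(psi) + C_d * math.sin(psi)
            C_t = C_l * math.sin(psi) - C_d * math.cos(psi)
            # step 5: tip/hub loss factor and updated induction factors, Eqn GAD_an_at
            f_tip = B * (r_tip - r) / (2.0 * r * math.sin(psi))
            f_hub = B * (r - r_hub) / (2.0 * r * math.sin(psi))
            F = (2.0 / math.pi) * (math.acos(math.exp(-f_tip))
                                   + math.acos(math.exp(-f_hub)))
            s = chord * B / (2.0 * math.pi * r)    # local solidity
            a_n_new = 1.0 / (1.0 + 4.0 * F * math.sin(psi)**2 / (s * C_n))
            a_t_new = 1.0 / (4.0 * F * math.sin(psi) * math.cos(psi) / (s * C_t) - 1.0)
            # step 6: stop once both induction factors change by less than tol
            if abs(a_n_new - a_n) < tol and abs(a_t_new - a_t) < tol:
                return a_n_new, a_t_new, psi
            a_n, a_t = a_n_new, a_t_new
        return a_n, a_t, psi

    # Hypothetical station: 8 m/s inflow, 1 rad/s rotor speed, r = 30 m, 3 blades.
    flat_polar = lambda alpha: (2.0 * math.pi * alpha, 0.01)  # stand-in for airfoil tables
    a_n, a_t, psi = gad_bem_station(V0=8.0, Omega=1.0, r=30.0, r_hub=1.5, r_tip=63.5,
                                    chord=3.0, twist=0.07, pitch=0.0, B=3,
                                    polar=flat_polar)

With converged :math:`a_n` and :math:`a_t`, steps 7 to 10 are direct evaluations of Eqns. :eq:`GAD_L_D`, :eq:`GAD_Fn_Ft`, :eq:`GAD_forces`, and :eq:`GAD_source_terms` in each mesh cell covered by the disk.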
diff --git a/Exec/ABL/inputs_DataSampler b/Exec/ABL/inputs_DataSampler new file mode 100644 index 000000000..ac46484c4 --- /dev/null +++ b/Exec/ABL/inputs_DataSampler @@ -0,0 +1,74 @@ +# ------------------ INPUTS TO MAIN PROGRAM ------------------- +max_step = 4000 + +amrex.fpe_trap_invalid = 1 + +fabarray.mfiter_tile_size = 1024 1024 1024 + +# PROBLEM SIZE & GEOMETRY +geometry.prob_extent = 1024 1024 1024 +amr.n_cell = 64 64 64 + +geometry.is_periodic = 1 1 0 + +zlo.type = "NoSlipWall" +zhi.type = "SlipWall" + +# TIME STEP CONTROL +erf.fixed_dt = 0.1 # fixed time step depending on grid resolution + +# DIAGNOSTICS & VERBOSITY +erf.sum_interval = 1 # timesteps between computing mass +erf.v = 1 # verbosity in ERF.cpp +amr.v = 1 # verbosity in Amr.cpp + +erf.data_log = my_data_file + +erf.sampler_interval = 1 +erf.do_line_sampling = true +erf.sample_line_lo = 5 32 5 10 32 5 +erf.sample_line_hi = 5 32 25 1000 32 5 +erf.sample_line_dir = 2 0 + +erf.do_plane_sampling = true +erf.sample_plane_lo = 48.0 48.0 32.0 +erf.sample_plane_hi = 320.0 320.0 32.0 +erf.sample_plane_dir = 2 + +# REFINEMENT / REGRIDDING +amr.max_level = 0 # maximum level number allowed + +# CHECKPOINT FILES +erf.check_file = chk # root name of checkpoint file +erf.check_int = 100 # number of timesteps between checkpoints + +# PLOTFILES +erf.plot_file_1 = plt # prefix of plotfile name +erf.plot_int_1 = 10 # number of timesteps between plotfiles +erf.plot_vars_1 = density rhoadv_0 x_velocity y_velocity z_velocity pressure temp theta + +# SOLVER CHOICE +erf.alpha_T = 0.0 +erf.alpha_C = 1.0 +erf.use_gravity = false + +erf.molec_diff_type = "None" +erf.les_type = "Smagorinsky" +erf.Cs = 0.1 + +erf.init_type = "uniform" + +# PROBLEM PARAMETERS +prob.rho_0 = 1.0 +prob.A_0 = 1.0 + +prob.U_0 = 10.0 +prob.V_0 = 0.0 +prob.W_0 = 0.0 +prob.T_0 = 300.0 + +# Higher values of perturbations lead to instability +# Instability seems to be coming from BC +prob.U_0_Pert_Mag = 0.08 +prob.V_0_Pert_Mag = 0.08 # +prob.W_0_Pert_Mag = 0.0 diff --git a/Exec/AWAKEN/inputs_All_EWP b/Exec/AWAKEN/inputs_All_EWP index e5b6f096f..cda4dff42 100644 --- a/Exec/AWAKEN/inputs_All_EWP +++ b/Exec/AWAKEN/inputs_All_EWP @@ -49,7 +49,6 @@ yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.125 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/AWAKEN/inputs_KingPlains_EWP b/Exec/AWAKEN/inputs_KingPlains_EWP index 26ea72b16..653b75e3c 100644 --- a/Exec/AWAKEN/inputs_KingPlains_EWP +++ b/Exec/AWAKEN/inputs_KingPlains_EWP @@ -39,7 +39,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.125 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/AWAKEN/inputs_KingPlains_Fitch b/Exec/AWAKEN/inputs_KingPlains_Fitch index 2e7ae4565..169f2a8d5 100644 --- a/Exec/AWAKEN/inputs_KingPlains_Fitch +++ b/Exec/AWAKEN/inputs_KingPlains_Fitch @@ -39,7 +39,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.125 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/AWAKEN/inputs_KingPlains_SimpleAD b/Exec/AWAKEN/inputs_KingPlains_SimpleAD index 0cccad7aa..9b1822545 100644 --- a/Exec/AWAKEN/inputs_KingPlains_SimpleAD +++ b/Exec/AWAKEN/inputs_KingPlains_SimpleAD @@ -41,7 +41,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.125 # fixed time step depending on grid resolution 
#erf.fixed_fast_dt = 0.0025 diff --git a/Exec/CMakeLists.txt b/Exec/CMakeLists.txt index fd00cc297..a5b384ebd 100644 --- a/Exec/CMakeLists.txt +++ b/Exec/CMakeLists.txt @@ -20,7 +20,6 @@ elseif (ERF_ENABLE_REGRESSION_TESTS_ONLY) else () add_subdirectory(ABL) add_subdirectory(SuperCell) - add_subdirectory(Radiation) add_subdirectory(SquallLine_2D) add_subdirectory(RegTests/Bubble) add_subdirectory(RegTests/Couette_Poiseuille) @@ -35,9 +34,10 @@ else () add_subdirectory(RegTests/WPS_Test) add_subdirectory(RegTests/Bomex) add_subdirectory(RegTests/TurbulentInflow) - add_subdirectory(DevTests/MovingTerrain) - add_subdirectory(DevTests/MetGrid) add_subdirectory(DevTests/LandSurfaceModel) + add_subdirectory(DevTests/MetGrid) + add_subdirectory(DevTests/MovingTerrain) + add_subdirectory(DevTests/Radiation) add_subdirectory(DevTests/TemperatureSource) add_subdirectory(DevTests/TropicalCyclone) endif() diff --git a/Exec/DevTests/NoahMP/CMakeLists.txt b/Exec/DevTests/NoahMP/CMakeLists.txt new file mode 100644 index 000000000..995474bb5 --- /dev/null +++ b/Exec/DevTests/NoahMP/CMakeLists.txt @@ -0,0 +1,4 @@ +set(erf_exe_name erf_noahmp) +add_executable(${erf_exe_name} "") +include(${CMAKE_SOURCE_DIR}/CMake/BuildERFExe.cmake) +build_erf_exe(${erf_exe_name}) diff --git a/Exec/DevTests/NoahMP/GNUmakefile b/Exec/DevTests/NoahMP/GNUmakefile new file mode 100644 index 000000000..04aafbb6d --- /dev/null +++ b/Exec/DevTests/NoahMP/GNUmakefile @@ -0,0 +1,33 @@ +# AMReX +COMP = gnu +PRECISION = DOUBLE + +# Profiling +PROFILE = FALSE +TINY_PROFILE = FALSE +COMM_PROFILE = FALSE +TRACE_PROFILE = FALSE +MEM_PROFILE = FALSE +USE_GPROF = FALSE + +# Performance +USE_MPI = TRUE +USE_OMP = FALSE + +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +# Debugging +DEBUG = FALSE + +# Land model +USE_NETCDF = TRUE +USE_NOAH = TRUE + +# GNU Make +Bpack := ./Make.package +Blocs := . +ERF_HOME := ../../.. 
+ERF_PROBLEM_DIR = $(ERF_HOME)/Exec/ABL +include $(ERF_HOME)/Exec/Make.ERF diff --git a/Exec/Radiation/CMakeLists.txt b/Exec/DevTests/Radiation/CMakeLists.txt similarity index 100% rename from Exec/Radiation/CMakeLists.txt rename to Exec/DevTests/Radiation/CMakeLists.txt diff --git a/Exec/Radiation/ERF_prob.H b/Exec/DevTests/Radiation/ERF_prob.H similarity index 100% rename from Exec/Radiation/ERF_prob.H rename to Exec/DevTests/Radiation/ERF_prob.H diff --git a/Exec/Radiation/ERF_prob.cpp b/Exec/DevTests/Radiation/ERF_prob.cpp similarity index 100% rename from Exec/Radiation/ERF_prob.cpp rename to Exec/DevTests/Radiation/ERF_prob.cpp diff --git a/Exec/Radiation/GNUmakefile b/Exec/DevTests/Radiation/GNUmakefile similarity index 100% rename from Exec/Radiation/GNUmakefile rename to Exec/DevTests/Radiation/GNUmakefile diff --git a/Exec/Radiation/Make.package b/Exec/DevTests/Radiation/Make.package similarity index 100% rename from Exec/Radiation/Make.package rename to Exec/DevTests/Radiation/Make.package diff --git a/Exec/Radiation/README b/Exec/DevTests/Radiation/README similarity index 100% rename from Exec/Radiation/README rename to Exec/DevTests/Radiation/README diff --git a/Exec/Radiation/inputs_radiation b/Exec/DevTests/Radiation/inputs_radiation similarity index 98% rename from Exec/Radiation/inputs_radiation rename to Exec/DevTests/Radiation/inputs_radiation index 9a3fdcf60..12fcd8e87 100644 --- a/Exec/Radiation/inputs_radiation +++ b/Exec/DevTests/Radiation/inputs_radiation @@ -18,7 +18,6 @@ zlo.type = "SlipWall" zhi.type = "SlipWall" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.25 # fixed time step [s] -- Straka et al 1993 diff --git a/Exec/EWP/inputs_1WT_lat_lon b/Exec/EWP/inputs_1WT_lat_lon index fe77134fb..215618298 100644 --- a/Exec/EWP/inputs_1WT_lat_lon +++ b/Exec/EWP/inputs_1WT_lat_lon @@ -51,7 +51,6 @@ xlo.theta = 300. # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 3.0 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/EWP/inputs_1WT_x_y b/Exec/EWP/inputs_1WT_x_y index ee14f42cf..e9c1296ef 100644 --- a/Exec/EWP/inputs_1WT_x_y +++ b/Exec/EWP/inputs_1WT_x_y @@ -49,7 +49,6 @@ xlo.theta = 300. 
# TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 3.0 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/EWP/inputs_WindFarm_lat_lon b/Exec/EWP/inputs_WindFarm_lat_lon index b3a549ceb..0b4b92a4b 100644 --- a/Exec/EWP/inputs_WindFarm_lat_lon +++ b/Exec/EWP/inputs_WindFarm_lat_lon @@ -36,7 +36,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.25 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/EWP/inputs_WindFarm_x_y b/Exec/EWP/inputs_WindFarm_x_y index 19b931695..1f102dc4d 100644 --- a/Exec/EWP/inputs_WindFarm_x_y +++ b/Exec/EWP/inputs_WindFarm_x_y @@ -34,7 +34,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.25 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/Make.ERF b/Exec/Make.ERF index 6c8a8328e..7aca7d619 100644 --- a/Exec/Make.ERF +++ b/Exec/Make.ERF @@ -112,7 +112,9 @@ AMReXdirs += EB endif ifeq ($(USE_POISSON_SOLVE),TRUE) -AMReXdirs += LinearSolvers/MLMG +USE_LINEAR_SOLVERS_INCFLO = FALSE +USE_LINEAR_SOLVERS_EM = FALSE +AMReXdirs += LinearSolvers endif ifeq ($(USE_HDF5),TRUE) @@ -181,22 +183,8 @@ INCLUDE_LOCATIONS += $(ERF_WINDFARM_GENERALAD_DIR) endif -ifeq ($(USE_HEFFTE),TRUE) -DEFINES += -DERF_USE_HEFFTE -VPATH_LOCATIONS += $(HEFFTE_HOME)/include -INCLUDE_LOCATIONS += $(HEFFTE_HOME)/include -LIBRARY_LOCATIONS += $(HEFFTE_HOME)/lib -LIBRARIES += -lheffte -ifeq ($(USE_CUDA),TRUE) - libraries += -lcufft -else ifeq ($(USE_HIP),TRUE) - # Use rocFFT. ROC_PATH is defined in amrex - INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include - LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib - LIBRARIES += -L$(ROC_PATH)/rocfft/lib -lrocfft -else - libraries += -lfftw3_mpi -lfftw3f -lfftw3 -endif +ifeq ($(USE_FFT),TRUE) +DEFINES += -DERF_USE_FFT endif ifeq ($(USE_WW3_COUPLING), TRUE) @@ -218,6 +206,25 @@ include $(ERF_LSM_SLM_DIR)/Make.package VPATH_LOCATIONS += $(ERF_LSM_SLM_DIR) INCLUDE_LOCATIONS += $(ERF_LSM_SLM_DIR) +# If using NOAH-MP model, then compile relevant source and headers +ifeq ($(USE_NOAH), TRUE) + ifneq ($(USE_NETCDF), TRUE) + $(error USE_NETCDF must be true for using NOAH-MP interface) + else + DEFINES += -DERF_USE_NOAH + includes += $(shell pkg-config --cflags netcdf-fortran) + LIBRARIES += $(shell pkg-config --libs netcdf-fortran) + NOAH_HOME ?= $(ERF_HOME)/Submodules/NOAH-MP + VPATH_LOCATIONS += $(NOAH_HOME)/drivers/hrldas + INCLUDE_LOCATIONS += $(NOAH_HOME)/drivers/hrldas + ERF_LSM_NOAH_DIR = $(ERF_SOURCE_DIR)/LandSurfaceModel/NOAH + include $(ERF_LSM_NOAH_DIR)/Make.package + VPATH_LOCATIONS += $(ERF_LSM_NOAH_DIR) + INCLUDE_LOCATIONS += $(ERF_LSM_NOAH_DIR) + endif +endif + + ERF_LSM_MM5_DIR = $(ERF_SOURCE_DIR)/LandSurfaceModel/MM5 include $(ERF_LSM_MM5_DIR)/Make.package VPATH_LOCATIONS += $(ERF_LSM_MM5_DIR) diff --git a/Exec/RegTests/DensityCurrent/inputs_amr b/Exec/RegTests/DensityCurrent/inputs_amr new file mode 100644 index 000000000..11a5373d4 --- /dev/null +++ b/Exec/RegTests/DensityCurrent/inputs_amr @@ -0,0 +1,87 @@ +# ------------------ INPUTS TO MAIN PROGRAM ------------------- +stop_time = 900.0 + +amr.blocking_factor_x = 16 +amr.blocking_factor_y = 1 +amr.blocking_factor_z = 16 + +#erf.anelastic = 1 +#erf.check_file = chka +#erf.plot_file_1 = plta + +erf.anelastic = 0 +erf.check_file = chk +erf.plot_file_1 = plt + +erf.buoyancy_type = 2 + +amrex.fpe_trap_invalid = 1 + +fabarray.mfiter_tile_size = 1024 1024 1024 + +# 
PROBLEM SIZE & GEOMETRY +geometry.prob_lo = 0. 0. 0. +geometry.prob_hi = 25600. 100. 6400. + +xlo.type = "Symmetry" +xhi.type = "Outflow" + +zlo.type = "SlipWall" +zhi.type = "SlipWall" + +geometry.is_periodic = 0 1 0 + +amr.max_level = 1 # maximum level number allowed + +amr.n_cell = 128 1 32 # dx=dz=200 m at level 0 (100 m at level 1), Straka et al 1993 / Xue et al 2000 +erf.fixed_dt = 2.0 # fixed time step [s] -- Straka et al 1993 +erf.fixed_fast_dt = 0.5 # fixed time step [s] -- Straka et al 1993 +erf.plot_int_1 = 100 # number of timesteps between plotfiles +erf.check_int =-1000 # number of timesteps between checkpoints + +# DIAGNOSTICS & VERBOSITY +erf.sum_interval =-1 # timesteps between computing mass +erf.v = 0 # verbosity in ERF.cpp +amr.v = 1 # verbosity in Amr.cpp + +# CHECKPOINT FILES + +# PLOTFILES +erf.plotfile_type = amrex +erf.plot_vars_1 = density x_velocity y_velocity z_velocity pressure theta pres_hse dens_hse pert_pres pert_dens + +# SOLVER CHOICE +erf.use_gravity = true +erf.use_coriolis = false +erf.use_terrain = false + +erf.les_type = "None" +# +# Diffusion coefficient from Straka, K = 75 m^2/s +# +erf.molec_diff_type = "ConstantAlpha" # where alpha == "K" in Straka et al 1993 +erf.rho0_trans = 1.0 # [kg/m^3], used to convert input diffusivities +erf.dynamicViscosity = 75.0 # [kg/(m-s)] ==> alpha = 75.0 m^2/s +erf.alpha_T = 75.0 # [m^2/s] + +erf.c_p = 1004.0 + +# PROBLEM PARAMETERS (optional) +prob.T_0 = 300.0 +prob.U_0 = 0.0 + +################################ MULTILEVEL ################################ +amr.ref_ratio_vect = 2 1 2 + +erf.coupling_type = "OneWay" +erf.regrid_int = 10 + +erf.refinement_indicators = lo_theta + +erf.lo_theta.max_level = 1 +erf.lo_theta.field_name = theta +erf.lo_theta.value_less = 299.0 + +amr.n_error_buf = 6 6 +amr.grid_eff = 0.8 +################################ MULTILEVEL ################################ diff --git a/Exec/RegTests/DensityCurrent/inputs_anelastic b/Exec/RegTests/DensityCurrent/inputs_anelastic deleted file mode 100644 index b555097ca..000000000 --- a/Exec/RegTests/DensityCurrent/inputs_anelastic +++ /dev/null @@ -1,86 +0,0 @@ -# ------------------ INPUTS TO MAIN PROGRAM ------------------- -stop_time = 900.0 - -erf.anelastic = 1 - -erf.use_terrain = false - -amrex.fpe_trap_invalid = 1 - -fabarray.mfiter_tile_size = 1024 1024 1024 - -#SYMMETRY / OUTFLOW VERSION -geometry.prob_lo = 0. 0. 0. -geometry.prob_hi = 25600. 100. 6400. -geometry.is_periodic = 0 1 0 -amr.n_cell = 256 1 64 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 -xlo.type = "Symmetry" -xhi.type = "HO_Outflow" - -#DOUBLY PERIODIC VERSION -#geometry.prob_lo = -25600. 0. 0. -#geometry.prob_hi = 25600. 100. 6400.
-#geometry.is_periodic = 1 1 0 -#amr.n_cell = 512 1 64 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 - -zlo.type = "SlipWall" -zhi.type = "SlipWall" - -# TIME STEP CONTROL -erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 - -# DIAGNOSTICS & VERBOSITY -erf.sum_interval = 1 # timesteps between computing mass -erf.v = 1 # verbosity in ERF.cpp -amr.v = 1 # verbosity in Amr.cpp -erf.mg_v = 1 # verbosity in ERF.cpp - -# REFINEMENT / REGRIDDING -amr.max_level = 0 # maximum level number allowed - -# CHECKPOINT FILES -erf.check_file = chk # root name of checkpoint file -erf.check_int = -1000 # number of timesteps between checkpoints - -# PLOTFILES -erf.plotfile_type = netcdf # prefix of plotfile name -erf.plotfile_type = amrex # prefix of plotfile name -erf.plot_file_1 = plta # prefix of plotfile name -erf.plot_int_1 = 100 # number of timesteps between plotfiles -erf.plot_vars_1 = density x_velocity y_velocity z_velocity pressure theta pres_hse dens_hse pert_pres pert_dens - -# SOLVER CHOICE -erf.use_gravity = true -erf.use_coriolis = false - -erf.les_type = "None" -# -# Diffusion coefficient from Straka, K = 75 m^2/s -# -erf.molec_diff_type = "ConstantAlpha" # where alpha == "K" in Straka et al 1993 -erf.rho0_trans = 1.0 # [kg/m^3], used to convert input diffusivities -erf.dynamicViscosity = 75.0 # [kg/(m-s)] ==> alpha = 75.0 m^2/s -erf.alpha_T = 75.0 # [m^2/s] - -erf.c_p = 1004.0 - -# PROBLEM PARAMETERS (optional) -prob.T_0 = 300.0 -prob.U_0 = 0.0 - -################################ MULTILEVEL ################################ -amr.max_level = 0 -amr.ref_ratio_vect = 2 1 2 - -erf.coupling_type = "TwoWay" -erf.regrid_int = 2 - -erf.refinement_indicators = lo_theta - -erf.lo_theta.max_level = 1 -erf.lo_theta.field_name = theta -erf.lo_theta.value_less = 299.9 - -amr.n_error_buf = 5 5 -amr.grid_eff = 0.8 -################################ MULTILEVEL ################################ diff --git a/Exec/RegTests/DensityCurrent/inputs_crse_halfdomain b/Exec/RegTests/DensityCurrent/inputs_crse_halfdomain index a1ad57079..4a0a211fc 100644 --- a/Exec/RegTests/DensityCurrent/inputs_crse_halfdomain +++ b/Exec/RegTests/DensityCurrent/inputs_crse_halfdomain @@ -1,7 +1,13 @@ # ------------------ INPUTS TO MAIN PROGRAM ------------------- stop_time = 900.0 -erf.use_terrain = false +#erf.anelastic = 1 +#erf.check_file = chka +#erf.plot_file_1 = plta + +erf.anelastic = 0 +erf.check_file = chk +erf.plot_file_1 = plt erf.buoyancy_type = 2 @@ -16,38 +22,49 @@ geometry.prob_hi = 25600. 100. 6400. 
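NOTE: with inputs_anelastic deleted above, the half-domain deck patched below carries both solver modes in one file: the anelastic settings survive as comments beside their compressible counterparts, so switching modes means flipping two blocks. A minimal sketch of the toggle, using lines from this diff:

    #erf.anelastic  = 1        # anelastic solver (pair with the chka/plta file prefixes)
    erf.anelastic   = 0        # compressible solver (active)
    erf.check_file  = chk
    erf.plot_file_1 = plt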
xlo.type = "Symmetry" xhi.type = "Outflow" -geometry.is_periodic = 0 1 0 - -amr.n_cell = 256 1 64 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 - zlo.type = "SlipWall" zhi.type = "SlipWall" -# TIME STEP CONTROL -erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 -erf.fixed_fast_dt = 0.25 # fixed time step [s] -- Straka et al 1993 +geometry.is_periodic = 0 1 0 + +amr.max_level = 0 # maximum level number allowed + +#fine +#amr.n_cell = 512 1 128 # dx=dy=dz=50 m, Straka et al 1993 / Xue et al 2000 +#erf.fixed_dt = 0.5 # fixed time step [s] -- Straka et al 1993 +#erf.fixed_fast_dt = 0.125 # fixed time step [s] -- Straka et al 1993 +#erf.plot_int_1 = 200 # number of timesteps between plotfiles +#erf.check_int = 200 # number of timesteps between checkpoints + +#crse +amr.n_cell = 256 1 64 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 +erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 +erf.fixed_fast_dt = 0.25 # fixed time step [s] -- Straka et al 1993 +erf.plot_int_1 = 300 # number of timesteps between plotfiles +erf.check_int =-1000 # number of timesteps between checkpoints + +#crser +#amr.n_cell = 128 1 32 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 +#erf.fixed_dt = 2.0 # fixed time step [s] -- Straka et al 1993 +#erf.fixed_fast_dt = 0.5 # fixed time step [s] -- Straka et al 1993 +#erf.plot_int_1 = 150 # number of timesteps between plotfiles +#erf.check_int = 1 # number of timesteps between checkpoints # DIAGNOSTICS & VERBOSITY -erf.sum_interval = 1 # timesteps between computing mass +erf.sum_interval =-1 # timesteps between computing mass erf.v = 1 # verbosity in ERF.cpp amr.v = 1 # verbosity in Amr.cpp -# REFINEMENT / REGRIDDING -amr.max_level = 0 # maximum level number allowed - # CHECKPOINT FILES -erf.check_file = chk # root name of checkpoint file -erf.check_int = -1000 # number of timesteps between checkpoints # PLOTFILES erf.plotfile_type = amrex # prefix of plotfile name -erf.plot_file_1 = pltc # prefix of plotfile name -erf.plot_int_1 = 1000 # number of timesteps between plotfiles erf.plot_vars_1 = density x_velocity y_velocity z_velocity pressure theta pres_hse dens_hse pert_pres pert_dens # SOLVER CHOICE -erf.use_gravity = true +erf.use_gravity = true erf.use_coriolis = false +erf.use_terrain = false erf.les_type = "None" # @@ -63,20 +80,3 @@ erf.c_p = 1004.0 # PROBLEM PARAMETERS (optional) prob.T_0 = 300.0 prob.U_0 = 0.0 - -################################ MULTILEVEL ################################ -amr.max_level = 0 -amr.ref_ratio_vect = 2 1 2 - -erf.coupling_type = "TwoWay" -erf.regrid_int = 2 - -erf.refinement_indicators = lo_theta - -erf.lo_theta.max_level = 1 -erf.lo_theta.field_name = theta -erf.lo_theta.value_less = 299.9 - -amr.n_error_buf = 5 5 -amr.grid_eff = 0.8 -################################ MULTILEVEL ################################ diff --git a/Exec/RegTests/DensityCurrent/inputs_crse_outflow b/Exec/RegTests/DensityCurrent/inputs_crse_outflow deleted file mode 100644 index d94b3bb2a..000000000 --- a/Exec/RegTests/DensityCurrent/inputs_crse_outflow +++ /dev/null @@ -1,64 +0,0 @@ -# ------------------ INPUTS TO MAIN PROGRAM ------------------- -max_step = 999999 -stop_time = 900.0 - -amrex.fpe_trap_invalid = 1 - -fabarray.mfiter_tile_size = 1024 1024 1024 - -# PROBLEM SIZE & GEOMETRY -geometry.prob_lo = -25600. 0. 0. -geometry.prob_hi = 25600. 400. 6400. 
- -amr.n_cell = 2048 4 256 # dx=dy=dz=25 m, Straka et al 1993 / Xue et al 2000 -amr.n_cell = 512 4 64 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 - -# periodic in x to match WRF setup -# - as an alternative, could use symmetry at x=0 and outflow at x=25600 -geometry.is_periodic = 0 1 0 - -xlo.type = "Outflow" -xhi.type = "Outflow" - -zlo.type = "SlipWall" -zhi.type = "SlipWall" - -# TIME STEP CONTROL -erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 -erf.fixed_fast_dt = 0.25 # fixed time step [s] -- Straka et al 1993 - -# DIAGNOSTICS & VERBOSITY -erf.sum_interval = 1 # timesteps between computing mass -erf.v = 1 # verbosity in ERF.cpp -amr.v = 1 # verbosity in Amr.cpp - -# REFINEMENT / REGRIDDING -amr.max_level = 0 # maximum level number allowed - -# CHECKPOINT FILES -erf.check_file = chk # root name of checkpoint file -erf.check_int = -57600 # number of timesteps between checkpoints - -# PLOTFILES -erf.plot_file_1 = plt # prefix of plotfile name -erf.plot_int_1 = 1000 # number of timesteps between plotfiles -erf.plot_vars_1 = density x_velocity y_velocity z_velocity pressure theta pres_hse dens_hse pert_pres pert_dens - -# SOLVER CHOICE -erf.use_gravity = true -erf.use_coriolis = false - -erf.les_type = "None" -# -# Diffusion coefficient from Straka, K = 75 m^2/s -# -erf.molec_diff_type = "ConstantAlpha" # where alpha == "K" in Straka et al 1993 -erf.rho0_trans = 1.0 # [kg/m^3], used to convert input diffusivities -erf.dynamicViscosity = 75.0 # [kg/(m-s)] ==> alpha = 75.0 m^2/s -erf.alpha_T = 75.0 # [m^2/s] - -erf.c_p = 1004.0 - -# PROBLEM PARAMETERS (optional) -prob.T_0 = 300.0 -prob.U_0 = 0.0 diff --git a/Exec/RegTests/DensityCurrent/inputs_crse_periodic b/Exec/RegTests/DensityCurrent/inputs_crse_periodic deleted file mode 100644 index 776f36d0c..000000000 --- a/Exec/RegTests/DensityCurrent/inputs_crse_periodic +++ /dev/null @@ -1,60 +0,0 @@ -# ------------------ INPUTS TO MAIN PROGRAM ------------------- -max_step = 999999 -stop_time = 900.0 - -amrex.fpe_trap_invalid = 1 - -fabarray.mfiter_tile_size = 1024 1024 1024 - -# PROBLEM SIZE & GEOMETRY -geometry.prob_lo = -25600. 0. 0. -geometry.prob_hi = 25600. 400. 6400. 
- -amr.n_cell = 2048 4 256 # dx=dy=dz=25 m, Straka et al 1993 / Xue et al 2000 -amr.n_cell = 512 4 64 # dx=dy=dz=100 m, Straka et al 1993 / Xue et al 2000 - -# periodic in x to match WRF setup -# - as an alternative, could use symmetry at x=0 and outflow at x=25600 -geometry.is_periodic = 1 1 0 -zlo.type = "SlipWall" -zhi.type = "SlipWall" - -# TIME STEP CONTROL -erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 -erf.fixed_fast_dt = 0.25 # fixed time step [s] -- Straka et al 1993 - -# DIAGNOSTICS & VERBOSITY -erf.sum_interval = 1 # timesteps between computing mass -erf.v = 1 # verbosity in ERF.cpp -amr.v = 1 # verbosity in Amr.cpp - -# REFINEMENT / REGRIDDING -amr.max_level = 0 # maximum level number allowed - -# CHECKPOINT FILES -erf.check_file = chk # root name of checkpoint file -erf.check_int = -57600 # number of timesteps between checkpoints - -# PLOTFILES -erf.plot_file_1 = plt # prefix of plotfile name -erf.plot_int_1 = 1000 # number of timesteps between plotfiles -erf.plot_vars_1 = density x_velocity y_velocity z_velocity pressure theta pres_hse dens_hse pert_pres pert_dens - -# SOLVER CHOICE -erf.use_gravity = true -erf.use_coriolis = false - -erf.les_type = "None" -# -# Diffusion coefficient from Straka, K = 75 m^2/s -# -erf.molec_diff_type = "ConstantAlpha" # where alpha == "K" in Straka et al 1993 -erf.rho0_trans = 1.0 # [kg/m^3], used to convert input diffusivities -erf.dynamicViscosity = 75.0 # [kg/(m-s)] ==> alpha = 75.0 m^2/s -erf.alpha_T = 75.0 # [m^2/s] - -erf.c_p = 1004.0 - -# PROBLEM PARAMETERS (optional) -prob.T_0 = 300.0 -prob.U_0 = 0.0 diff --git a/Exec/RegTests/TaylorGreenVortex/inputs_multilevel b/Exec/RegTests/TaylorGreenVortex/inputs_multilevel index e4cb00b3d..e53115628 100644 --- a/Exec/RegTests/TaylorGreenVortex/inputs_multilevel +++ b/Exec/RegTests/TaylorGreenVortex/inputs_multilevel @@ -18,8 +18,6 @@ zhi.type = "SlipWall" erf.fixed_dt = 4e-2 # fixed time step erf.mri_fixed_dt_ratio = 4 -erf.use_native_mri = 0 - # DIAGNOSTICS & VERBOSITY erf.sum_interval = 1 # timesteps between computing mass erf.v = 1 # verbosity in ERF.cpp diff --git a/Exec/SimpleActuatorDisk/inputs_1WT_lat_lon b/Exec/SimpleActuatorDisk/inputs_1WT_lat_lon index a5d799cfd..ff237b1cc 100644 --- a/Exec/SimpleActuatorDisk/inputs_1WT_lat_lon +++ b/Exec/SimpleActuatorDisk/inputs_1WT_lat_lon @@ -51,7 +51,6 @@ xlo.theta = 300. # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 3.0 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/SimpleActuatorDisk/inputs_1WT_x_y b/Exec/SimpleActuatorDisk/inputs_1WT_x_y index 6a7986956..4796a2cf4 100644 --- a/Exec/SimpleActuatorDisk/inputs_1WT_x_y +++ b/Exec/SimpleActuatorDisk/inputs_1WT_x_y @@ -49,7 +49,6 @@ xlo.theta = 300. 
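NOTE: the inputs_crse_outflow and inputs_crse_periodic decks deleted above differed from the retained configuration chiefly in domain extent and lateral boundary treatment; the surviving half-domain deck exploits the left-right symmetry of the density current so that only half the domain need be simulated. For reference, the retained boundary block, copied from inputs_crse_halfdomain earlier in this diff:

    geometry.is_periodic = 0 1 0        # periodic in y only
    xlo.type = "Symmetry"
    xhi.type = "Outflow"
    zlo.type = "SlipWall"
    zhi.type = "SlipWall"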
# TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.1 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/SimpleActuatorDisk/inputs_WindFarm_lat_lon b/Exec/SimpleActuatorDisk/inputs_WindFarm_lat_lon index e6ebae87b..8fe3e2e82 100644 --- a/Exec/SimpleActuatorDisk/inputs_WindFarm_lat_lon +++ b/Exec/SimpleActuatorDisk/inputs_WindFarm_lat_lon @@ -36,7 +36,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.25 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/SimpleActuatorDisk/inputs_WindFarm_x_y b/Exec/SimpleActuatorDisk/inputs_WindFarm_x_y index 7fbed1830..72e9af156 100644 --- a/Exec/SimpleActuatorDisk/inputs_WindFarm_x_y +++ b/Exec/SimpleActuatorDisk/inputs_WindFarm_x_y @@ -34,7 +34,6 @@ ylo.type = "Outflow" yhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.25 # fixed time step depending on grid resolution #erf.fixed_fast_dt = 0.0025 diff --git a/Exec/SquallLine_2D/inputs_ml b/Exec/SquallLine_2D/inputs_ml index 9169252c4..4a4f50617 100644 --- a/Exec/SquallLine_2D/inputs_ml +++ b/Exec/SquallLine_2D/inputs_ml @@ -20,7 +20,6 @@ zlo.type = "SlipWall" zhi.type = "SlipWall" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.5 # fixed time step [s] -- Straka et al 1993 diff --git a/Exec/SquallLine_2D/inputs_moisture_Gabersek b/Exec/SquallLine_2D/inputs_moisture_Gabersek index 235ef5941..8c4ac7e2e 100644 --- a/Exec/SquallLine_2D/inputs_moisture_Gabersek +++ b/Exec/SquallLine_2D/inputs_moisture_Gabersek @@ -19,7 +19,6 @@ zlo.type = "SlipWall" zhi.type = "HO_Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 0.25 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.125 # fixed time step [s] -- Straka et al 1993 diff --git a/Exec/SquallLine_2D/inputs_moisture_SAM b/Exec/SquallLine_2D/inputs_moisture_SAM index b920bbe5f..3f7ea21b1 100644 --- a/Exec/SquallLine_2D/inputs_moisture_SAM +++ b/Exec/SquallLine_2D/inputs_moisture_SAM @@ -19,7 +19,6 @@ zlo.type = "SlipWall" zhi.type = "Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.5 # fixed time step [s] -- Straka et al 1993 diff --git a/Exec/SquallLine_2D/inputs_moisture_WRF b/Exec/SquallLine_2D/inputs_moisture_WRF index 677d5380b..b1fea7fdb 100644 --- a/Exec/SquallLine_2D/inputs_moisture_WRF +++ b/Exec/SquallLine_2D/inputs_moisture_WRF @@ -19,7 +19,6 @@ zlo.type = "SlipWall" zhi.type = "HO_Outflow" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.5 # fixed time step [s] -- Straka et al 1993 diff --git a/Exec/SuperCell/inputs_moisture b/Exec/SuperCell/inputs_moisture index b363a84cb..92e812fe7 100644 --- a/Exec/SuperCell/inputs_moisture +++ b/Exec/SuperCell/inputs_moisture @@ -18,7 +18,6 @@ zlo.type = "SlipWall" zhi.type = "SlipWall" # TIME STEP CONTROL -erf.use_native_mri = 1 erf.fixed_dt = 1.0 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.25 # fixed time step [s] -- Straka et al 1993 diff --git a/Exec/SuperCell_3D/inputs_Supercell_3D b/Exec/SuperCell_3D/inputs_Supercell_3D index 3f2daff82..55a6955c5 100644 --- a/Exec/SuperCell_3D/inputs_Supercell_3D +++ b/Exec/SuperCell_3D/inputs_Supercell_3D @@ -22,7 +22,6 @@ zlo.type = "SlipWall" zhi.type = "HO_Outflow" # TIME STEP CONTROL 
-erf.use_native_mri = 1 erf.fixed_dt = 0.25 # fixed time step [s] -- Straka et al 1993 erf.fixed_fast_dt = 0.125 # fixed time step [s] -- Straka et al 1993 diff --git a/Source/BoundaryConditions/ERF_BoundaryConditions_basestate.cpp b/Source/BoundaryConditions/ERF_BoundaryConditions_basestate.cpp index ede2449c1..e7a47ad52 100644 --- a/Source/BoundaryConditions/ERF_BoundaryConditions_basestate.cpp +++ b/Source/BoundaryConditions/ERF_BoundaryConditions_basestate.cpp @@ -13,7 +13,7 @@ using namespace amrex; void ERFPhysBCFunct_base::impose_lateral_basestate_bcs (const Array4& dest_arr, const Box& bx, const Box& domain) { - BL_PROFILE_VAR("impose_lateral_cons_bcs()",impose_lateral_cons_bcs); + BL_PROFILE_VAR("impose_lateral_base_bcs()",impose_lateral_base_bcs); int icomp = 0; int ncomp = 3; diff --git a/Source/BoundaryConditions/ERF_BoundaryConditions_cons.cpp b/Source/BoundaryConditions/ERF_BoundaryConditions_cons.cpp index b46c5f466..1a2edbd1c 100644 --- a/Source/BoundaryConditions/ERF_BoundaryConditions_cons.cpp +++ b/Source/BoundaryConditions/ERF_BoundaryConditions_cons.cpp @@ -15,7 +15,7 @@ using namespace amrex; */ void ERFPhysBCFunct_cons::impose_lateral_cons_bcs (const Array4& dest_arr, const Box& bx, const Box& domain, - int icomp, int ncomp, int ngz) + int icomp, int ncomp, IntVect ng) { BL_PROFILE_VAR("impose_lateral_cons_bcs()",impose_lateral_cons_bcs); const auto& dom_lo = lbound(domain); @@ -70,6 +70,7 @@ void ERFPhysBCFunct_cons::impose_lateral_cons_bcs (const Array4& dest_arr, { Box bx_xlo(bx); bx_xlo.setBig (0,dom_lo.x-1); Box bx_xhi(bx); bx_xhi.setSmall(0,dom_hi.x+1); + ParallelFor( bx_xlo, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) { @@ -108,6 +109,7 @@ void ERFPhysBCFunct_cons::impose_lateral_cons_bcs (const Array4& dest_arr, { Box bx_ylo(bx); bx_ylo.setBig (1,dom_lo.y-1); Box bx_yhi(bx); bx_yhi.setSmall(1,dom_hi.y+1); + ParallelFor( bx_ylo, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) { @@ -147,10 +149,10 @@ void ERFPhysBCFunct_cons::impose_lateral_cons_bcs (const Array4& dest_arr, // Populate ghost cells on lo-x and hi-x domain boundaries Box bx_xlo(bx); bx_xlo.setBig (0,dom_lo.x-1); Box bx_xhi(bx); bx_xhi.setSmall(0,dom_hi.x+1); - if (bx_xlo.smallEnd(2) != domain.smallEnd(2)) bx_xlo.growLo(2,ngz); - if (bx_xlo.bigEnd(2) != domain.bigEnd(2)) bx_xlo.growHi(2,ngz); - if (bx_xhi.smallEnd(2) != domain.smallEnd(2)) bx_xhi.growLo(2,ngz); - if (bx_xhi.bigEnd(2) != domain.bigEnd(2)) bx_xhi.growHi(2,ngz); + if (bx_xlo.smallEnd(2) != domain.smallEnd(2)) bx_xlo.growLo(2,ng[2]); + if (bx_xlo.bigEnd(2) != domain.bigEnd(2)) bx_xlo.growHi(2,ng[2]); + if (bx_xhi.smallEnd(2) != domain.smallEnd(2)) bx_xhi.growLo(2,ng[2]); + if (bx_xhi.bigEnd(2) != domain.bigEnd(2)) bx_xhi.growHi(2,ng[2]); ParallelFor( bx_xlo, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) { @@ -196,10 +198,10 @@ void ERFPhysBCFunct_cons::impose_lateral_cons_bcs (const Array4& dest_arr, // Populate ghost cells on lo-y and hi-y domain boundaries Box bx_ylo(bx); bx_ylo.setBig (1,dom_lo.y-1); Box bx_yhi(bx); bx_yhi.setSmall(1,dom_hi.y+1); - if (bx_ylo.smallEnd(2) != domain.smallEnd(2)) bx_ylo.growLo(2,ngz); - if (bx_ylo.bigEnd(2) != domain.bigEnd(2)) bx_ylo.growHi(2,ngz); - if (bx_yhi.smallEnd(2) != domain.smallEnd(2)) bx_yhi.growLo(2,ngz); - if (bx_yhi.bigEnd(2) != domain.bigEnd(2)) bx_yhi.growHi(2,ngz); + if (bx_ylo.smallEnd(2) != domain.smallEnd(2)) bx_ylo.growLo(2,ng[2]); + if (bx_ylo.bigEnd(2) != domain.bigEnd(2)) bx_ylo.growHi(2,ng[2]); + if (bx_yhi.smallEnd(2) != domain.smallEnd(2)) 
bx_yhi.growLo(2,ng[2]); + if (bx_yhi.bigEnd(2) != domain.bigEnd(2)) bx_yhi.growHi(2,ng[2]); ParallelFor( bx_ylo, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) { @@ -261,7 +263,7 @@ void ERFPhysBCFunct_cons::impose_vertical_cons_bcs (const Array4& dest_arr const GpuArray dxInv, int icomp, int ncomp) { - BL_PROFILE_VAR("impose_lateral_cons_bcs()",impose_lateral_cons_bcs); + BL_PROFILE_VAR("impose_vertical_cons_bcs()",impose_vertical_cons_bcs); const auto& dom_lo = lbound(domain); const auto& dom_hi = ubound(domain); diff --git a/Source/BoundaryConditions/ERF_BoundaryConditions_realbdy.cpp b/Source/BoundaryConditions/ERF_BoundaryConditions_realbdy.cpp index 289349d90..5d6be8b7d 100644 --- a/Source/BoundaryConditions/ERF_BoundaryConditions_realbdy.cpp +++ b/Source/BoundaryConditions/ERF_BoundaryConditions_realbdy.cpp @@ -67,6 +67,8 @@ ERF::fill_from_realbdy (const Vector& mfs, { MultiFab& mf = *mfs[var_idx]; + mf.FillBoundary(geom[lev].periodicity()); + // // Note that "domain" is mapped onto the type of box the data is in // diff --git a/Source/BoundaryConditions/ERF_FillBdyCCVels.cpp b/Source/BoundaryConditions/ERF_FillBdyCCVels.cpp new file mode 100644 index 000000000..5726d06c0 --- /dev/null +++ b/Source/BoundaryConditions/ERF_FillBdyCCVels.cpp @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include + +using namespace amrex; + +void +ERF::FillBdyCCVels (Vector& mf_cc_vel) +{ + // Impose bc's at domain boundaries + for (int lev = 0; lev <= finest_level; ++lev) + { + Box domain(Geom(lev).Domain()); + + int ihi = domain.bigEnd(0); + int jhi = domain.bigEnd(1); + int khi = domain.bigEnd(2); + + // Impose periodicity first + mf_cc_vel[lev].FillBoundary(geom[lev].periodicity()); + + for (MFIter mfi(mf_cc_vel[lev], TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + // Note that we don't fill corners here -- only the cells that share a face + // with interior cells -- this is all that is needed to calculate vorticity + const Box& bx = mfi.tilebox(); + const Array4& vel_arr = mf_cc_vel[lev].array(mfi); + + if (!Geom(lev).isPeriodic(0)) { + // Low-x side + if (bx.smallEnd(0) <= domain.smallEnd(0)) { + Real mult = (phys_bc_type[0] == ERF_BC::no_slip_wall) ? -1. : 1.; + ParallelFor(makeSlab(bx,0,0), [=] AMREX_GPU_DEVICE(int , int j, int k) noexcept + { + vel_arr(-1,j,k,1) = mult*vel_arr(0,j,k,1); // v + vel_arr(-1,j,k,2) = mult*vel_arr(0,j,k,2); // w + }); + } + + // High-x side + if (bx.bigEnd(0) >= domain.bigEnd(0)) { + Real mult = (phys_bc_type[3] == ERF_BC::no_slip_wall) ? -1. : 1.; + ParallelFor(makeSlab(bx,0,0), [=] AMREX_GPU_DEVICE(int , int j, int k) noexcept + { + vel_arr(ihi+1,j,k,1) = mult*vel_arr(ihi,j,k,1); // v + vel_arr(ihi+1,j,k,2) = mult*vel_arr(ihi,j,k,2); // w + }); + } + } // !periodic + + if (!Geom(lev).isPeriodic(1)) { + // Low-y side + if (bx.smallEnd(1) <= domain.smallEnd(1)) { + Real mult = (phys_bc_type[1] == ERF_BC::no_slip_wall) ? -1. : 1.; + ParallelFor(makeSlab(bx,1,0), [=] AMREX_GPU_DEVICE(int i, int , int k) noexcept + { + vel_arr(i,-1,k,0) = mult*vel_arr(i,0,k,0); // u + vel_arr(i,-1,k,2) = mult*vel_arr(i,0,k,2); // w + }); + } + + // High-y side + if (bx.bigEnd(1) >= domain.bigEnd(1)) { + Real mult = (phys_bc_type[4] == ERF_BC::no_slip_wall) ? -1. 
: 1.; + ParallelFor(makeSlab(bx,1,0), [=] AMREX_GPU_DEVICE(int i, int , int k) noexcept + { + vel_arr(i,jhi+1,k,0) = mult*vel_arr(i,jhi,k,0); // u + vel_arr(i,jhi+1,k,2) = mult*vel_arr(i,jhi,k,2); // w + }); + } + } // !periodic + + if (!Geom(lev).isPeriodic(2)) { + // Low-z side + if (bx.smallEnd(2) <= domain.smallEnd(2)) { + Real mult = (phys_bc_type[2] == ERF_BC::no_slip_wall) ? -1. : 1.; + ParallelFor(makeSlab(bx,2,0), [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + vel_arr(i,j,-1,0) = mult*vel_arr(i,j,0,0); // u + vel_arr(i,j,-1,1) = mult*vel_arr(i,j,0,1); // v + }); + } + + // High-z side + if (bx.bigEnd(2) >= domain.bigEnd(2)) { + Real mult = (phys_bc_type[5] == ERF_BC::no_slip_wall) ? -1. : 1.; + ParallelFor(makeSlab(bx,2,0), [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + vel_arr(i,j,khi+1,0) = mult*vel_arr(i,j,khi,0); // u + vel_arr(i,j,khi+1,1) = mult*vel_arr(i,j,khi,1); // v + }); + } + } // !periodic + } // MFIter + + } // lev +} diff --git a/Source/BoundaryConditions/ERF_FillCoarsePatch.cpp b/Source/BoundaryConditions/ERF_FillCoarsePatch.cpp new file mode 100644 index 000000000..9815d050a --- /dev/null +++ b/Source/BoundaryConditions/ERF_FillCoarsePatch.cpp @@ -0,0 +1,132 @@ +#include +#include +#include +#include +#include +#include + +using namespace amrex; + +/* + * Fill valid and ghost data. + * This version fills an entire MultiFab by interpolating from the coarser level -- this is used + * only when a new level of refinement is being created during a run (i.e. not at initialization). + * This will never be used with static refinement. + * + * @param[in] lev level of refinement at which to fill the data + * @param[in] time time at which the data should be filled + * @param[out] mfs Vector of MultiFabs to be filled containing, in order: cons, xvel, yvel, and zvel data + */ +void +ERF::FillCoarsePatch (int lev, Real time) +{ + BL_PROFILE_VAR("FillCoarsePatch()",FillCoarsePatch); + AMREX_ASSERT(lev > 0); + + // + //**************************************************************************************************************** + // First fill velocities and density at the COARSE level so we can convert velocity to momenta at the COARSE level + //**************************************************************************************************************** + // + bool cons_only = false; + FillPatch(lev-1, time, {&vars_new[lev-1][Vars::cons], &vars_new[lev-1][Vars::xvel], + &vars_new[lev-1][Vars::yvel], &vars_new[lev-1][Vars::zvel]}, + {&vars_new[lev-1][Vars::cons], + &rU_new[lev-1], &rV_new[lev-1], &rW_new[lev-1]}, + false, cons_only); + + // + // ************************************************ + // Convert velocity to momentum at the COARSE level + // ************************************************ + // + VelocityToMomentum(vars_new[lev-1][Vars::xvel], IntVect{0}, + vars_new[lev-1][Vars::yvel], IntVect{0}, + vars_new[lev-1][Vars::zvel], IntVect{0}, + vars_new[lev-1][Vars::cons], + rU_new[lev-1], + rV_new[lev-1], + rW_new[lev-1], + Geom(lev).Domain(), + domain_bcs_type); + // + // ***************************************************************** + // Interpolate all cell-centered variables from coarse to fine level + // ***************************************************************** + // + Interpolater* mapper_c = &cell_cons_interp; + Interpolater* mapper_f = &face_cons_linear_interp; + + // + //************************************************************************************************ + // Interpolate cell-centered data from coarse to fine level + // 
with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled + // ************************************************************************************************ + IntVect ngvect_cons = vars_new[lev][Vars::cons].nGrowVect(); + int ncomp_cons = vars_new[lev][Vars::cons].nComp(); + + InterpFromCoarseLevel(vars_new[lev ][Vars::cons], ngvect_cons, IntVect(0,0,0), + vars_new[lev-1][Vars::cons], 0, 0, ncomp_cons, + geom[lev-1], geom[lev], + refRatio(lev-1), mapper_c, domain_bcs_type, BCVars::cons_bc); + + // + //************************************************************************************************ + // Interpolate x-momentum from coarse to fine level + // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled + // ************************************************************************************************ + // + InterpFromCoarseLevel(rU_new[lev], IntVect{0}, IntVect{0}, rU_new[lev-1], 0, 0, 1, + geom[lev-1], geom[lev], + refRatio(lev-1), mapper_f, domain_bcs_type, BCVars::xvel_bc); + + // + //************************************************************************************************ + // Interpolate y-momentum from coarse to fine level + // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled + // ************************************************************************************************ + // + InterpFromCoarseLevel(rV_new[lev], IntVect{0}, IntVect{0}, rV_new[lev-1], 0, 0, 1, + geom[lev-1], geom[lev], + refRatio(lev-1), mapper_f, domain_bcs_type, BCVars::yvel_bc); + + //************************************************************************************************ + // Interpolate z-momentum from coarse to fine level + // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled + // ************************************************************************************************ + InterpFromCoarseLevel(rW_new[lev], IntVect{0}, IntVect{0}, rW_new[lev-1], 0, 0, 1, + geom[lev-1], geom[lev], + refRatio(lev-1), mapper_f, domain_bcs_type, BCVars::zvel_bc); + // + // ********************************************************* + // After interpolation of momentum, convert back to velocity + // ********************************************************* + // + for (int which_lev = lev-1; which_lev <= lev; which_lev++) + { + MomentumToVelocity(vars_new[which_lev][Vars::xvel], + vars_new[which_lev][Vars::yvel], + vars_new[which_lev][Vars::zvel], + vars_new[which_lev][Vars::cons], + rU_new[which_lev], + rV_new[which_lev], + rW_new[which_lev], + Geom(lev).Domain(), + domain_bcs_type); + } + + // *************************************************************************** + // Physical bc's at domain boundary + // *************************************************************************** + IntVect ngvect_vels = vars_new[lev][Vars::xvel].nGrowVect(); + + (*physbcs_cons[lev])(vars_new[lev][Vars::cons],0,ncomp_cons,ngvect_cons,time,BCVars::cons_bc,true); + ( *physbcs_u[lev])(vars_new[lev][Vars::xvel],0,1 ,ngvect_vels,time,BCVars::xvel_bc,true); + ( *physbcs_v[lev])(vars_new[lev][Vars::yvel],0,1 ,ngvect_vels,time,BCVars::yvel_bc,true); + ( *physbcs_w[lev])(vars_new[lev][Vars::zvel],vars_new[lev][Vars::xvel],vars_new[lev][Vars::yvel], + ngvect_vels,time,BCVars::zvel_bc,true); + + // *************************************************************************** + // Since lev > 0 here we don't worry about m_r2d or wrfbdy data + // 
*************************************************************************** +} diff --git a/Source/BoundaryConditions/ERF_FillIntermediatePatch.cpp b/Source/BoundaryConditions/ERF_FillIntermediatePatch.cpp new file mode 100644 index 000000000..99be97002 --- /dev/null +++ b/Source/BoundaryConditions/ERF_FillIntermediatePatch.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include +#include +#include + +using namespace amrex; + +/* + * Fill valid and ghost data + * This version fills mfs in valid regions with the values in "mfs" when it is passed in; + * it is used only to compute ghost values for intermediate stages of a time integrator. + * + * @param[in] lev level of refinement at which to fill the data + * @param[in] time time at which the data should be filled + * @param[out] mfs_vel Vector of MultiFabs to be filled containing, in order: cons, xvel, yvel, and zvel + * @param[out] mfs_mom Vector of MultiFabs to be filled containing, in order: cons, xmom, ymom, and zmom + * @param[in] ng_cons number of ghost cells to be filled for conserved (cell-centered) variables + * @param[in] ng_vel number of ghost cells to be filled for velocity components + * @param[in] cons_only if true then only fill conserved variables + * @param[in] icomp_cons starting component for conserved variables + * @param[in] ncomp_cons number of components for conserved variables + * @param[in] allow_most_bcs if true then use MOST bcs at the low boundary + */ +void +ERF::FillIntermediatePatch (int lev, Real time, + const Vector<MultiFab*>& mfs_vel, // This includes cc quantities and VELOCITIES + const Vector<MultiFab*>& mfs_mom, // This includes cc quantities and MOMENTA + int ng_cons, int ng_vel, bool cons_only, + int icomp_cons, int ncomp_cons, + bool allow_most_bcs) +{ + BL_PROFILE_VAR("FillIntermediatePatch()",FillIntermediatePatch); + Interpolater* mapper; + + PhysBCFunctNoOp null_bc; + + // + // *************************************************************************** + // The first thing we do is interpolate the momenta on the "valid" faces of + // the fine grids (where the interface is coarse/fine not fine/fine) -- this + // will not be over-written by interpolation below because the FillPatch + // operators see these as valid faces. But we must have these interpolated + // values in the fine data before we call FillPatchTwoLevels.
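NOTE: the invariant this new routine maintains (restated in its closing comments) is that callers pass momenta in and receive momenta back, while every boundary operation in between acts on velocities. A toy, self-contained sketch of that round trip -- it assumes nothing beyond the convention that momentum = density * velocity; names and values here are illustrative, not from ERF:

    #include <cassert>

    // Single-cell caricature of the MomentumToVelocity / VelocityToMomentum
    // round trip performed by FillIntermediatePatch.
    int main() {
        double rho = 1.2, rho_u = 0.6;   // conserved state on entry
        double u = rho_u / rho;          // momentum -> velocity
        u = -u;                          // e.g. a no-slip ghost-cell reflection, applied to velocity
        rho_u = rho * u;                 // velocity -> momentum before returning
        assert(rho_u == -0.6);           // exact: 0.6/1.2 and 1.2*0.5 round-trip exactly in binary
        return 0;
    }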
+ // + // Also -- note that we might be filling values by interpolation at physical boundaries + // here but that's ok because we will overwrite those values when we impose + // the physical bc's below + // *************************************************************************** + if (lev>0) { + if (cf_set_width > 0) { + // We note that mfs_vel[Vars::cons] and mfs_mom[Vars::cons] are in fact the same pointer + FPr_c[lev-1].FillSet(*mfs_vel[Vars::cons], time, null_bc, domain_bcs_type); + } + if ( !cons_only && (cf_set_width >= 0) ) { + FPr_u[lev-1].FillSet(*mfs_mom[IntVars::xmom], time, null_bc, domain_bcs_type); + FPr_v[lev-1].FillSet(*mfs_mom[IntVars::ymom], time, null_bc, domain_bcs_type); + FPr_w[lev-1].FillSet(*mfs_mom[IntVars::zmom], time, null_bc, domain_bcs_type); + } + } + + AMREX_ALWAYS_ASSERT(mfs_mom.size() == IntVars::NumTypes); + AMREX_ALWAYS_ASSERT(mfs_vel.size() == Vars::NumTypes); + + // Enforce no penetration for thin immersed body + if (xflux_imask[lev]) { + ApplyMask(*mfs_mom[IntVars::xmom], *xflux_imask[lev]); + } + if (yflux_imask[lev]) { + ApplyMask(*mfs_mom[IntVars::ymom], *yflux_imask[lev]); + } + if (zflux_imask[lev]) { + ApplyMask(*mfs_mom[IntVars::zmom], *zflux_imask[lev]); + } + + // We always come in to this call with updated momenta but we need to create updated velocity + // in order to impose the rest of the bc's + if (!cons_only) { + // This only fills VALID region of velocity + MomentumToVelocity(*mfs_vel[Vars::xvel], *mfs_vel[Vars::yvel], *mfs_vel[Vars::zvel], + *mfs_vel[Vars::cons], + *mfs_mom[IntVars::xmom], *mfs_mom[IntVars::ymom], *mfs_mom[IntVars::zmom], + Geom(lev).Domain(), domain_bcs_type); + } + + // + // We now start working on conserved quantities + VELOCITY + // + if (lev == 0) + { + // We don't do anything here because we will call the physbcs routines below, + // which calls FillBoundary and fills other domain boundary conditions + // Physical boundaries will be filled below + } + else + { + MultiFab& mf = *mfs_vel[Vars::cons]; + + Vector fmf = {&mf,&mf}; + Vector cmf = {&vars_old[lev-1][Vars::cons], &vars_new[lev-1][Vars::cons]}; + Vector ctime = {t_old[lev-1], t_new[lev-1]}; + Vector ftime = {time,time}; + + // Impose physical bc's on coarse data (note time and 0 are not used) + (*physbcs_cons[lev-1])(vars_old[lev-1][Vars::cons],0,ncomp_cons,IntVect{ng_cons},time,BCVars::cons_bc,true); + (*physbcs_cons[lev-1])(vars_new[lev-1][Vars::cons],0,ncomp_cons,IntVect{ng_cons},time,BCVars::cons_bc,true); + + // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled + mapper = &cell_cons_interp; + FillPatchTwoLevels(mf, IntVect{ng_cons}, IntVect(0,0,0), + time, cmf, ctime, fmf, ftime, + icomp_cons, icomp_cons, ncomp_cons, geom[lev-1], geom[lev], + refRatio(lev-1), mapper, domain_bcs_type, + icomp_cons); + + // ***************************************************************************************** + + if (!cons_only) + { + mapper = &face_cons_linear_interp; + + // + // NOTE: All interpolation here happens on velocities not momenta; + // note we only do the interpolation and FillBoundary here, + // physical bc's are imposed later + // + // NOTE: This will only fill velocity from coarse grid *outside* the fine grids + // unlike the FillSet calls above which filled momenta on the coarse/fine bdy + // + + MultiFab& mfu = *mfs_vel[Vars::xvel]; + + fmf = {&mfu,&mfu}; + cmf = {&vars_old[lev-1][Vars::xvel], &vars_new[lev-1][Vars::xvel]}; + + // Impose physical bc's on coarse data (note time and 0 are not used) + 
(*physbcs_u[lev-1])(vars_old[lev-1][Vars::xvel],0,1,IntVect{ng_vel},time,BCVars::xvel_bc,true); + (*physbcs_u[lev-1])(vars_new[lev-1][Vars::xvel],0,1,IntVect{ng_vel},time,BCVars::xvel_bc,true); + + // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled + FillPatchTwoLevels(mfu, IntVect{ng_vel}, IntVect(0,0,0), + time, cmf, ctime, fmf, ftime, + 0, 0, 1, geom[lev-1], geom[lev], + refRatio(lev-1), mapper, domain_bcs_type, + BCVars::xvel_bc); + + // ***************************************************************************************** + + MultiFab& mfv = *mfs_vel[Vars::yvel]; + + fmf = {&mfv,&mfv}; + cmf = {&vars_old[lev-1][Vars::yvel], &vars_new[lev-1][Vars::yvel]}; + + // Impose physical bc's on coarse data (note time and 0 are not used) + (*physbcs_v[lev-1])(vars_old[lev-1][Vars::yvel],0,1,IntVect{ng_vel},time,BCVars::yvel_bc,true); + (*physbcs_v[lev-1])(vars_new[lev-1][Vars::yvel],0,1,IntVect{ng_vel},time,BCVars::yvel_bc,true); + + // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled + FillPatchTwoLevels(mfv, IntVect{ng_vel}, IntVect(0,0,0), + time, cmf, ctime, fmf, ftime, + 0, 0, 1, geom[lev-1], geom[lev], + refRatio(lev-1), mapper, domain_bcs_type, + BCVars::yvel_bc); + + // ***************************************************************************************** + + MultiFab& mfw = *mfs_vel[Vars::zvel]; + + fmf = {&mfw,&mfw}; + cmf = {&vars_old[lev-1][Vars::zvel], &vars_new[lev-1][Vars::zvel]}; + + // Impose physical bc's on coarse data (note time and 0 are not used) + (*physbcs_w[lev-1])(vars_old[lev-1][Vars::zvel], + vars_old[lev-1][Vars::xvel], + vars_old[lev-1][Vars::yvel], + IntVect{ng_vel},time,BCVars::zvel_bc,true); + (*physbcs_w[lev-1])(vars_new[lev-1][Vars::zvel], + vars_new[lev-1][Vars::xvel], + vars_new[lev-1][Vars::yvel], + IntVect{ng_vel},time,BCVars::zvel_bc,true); + + // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled + FillPatchTwoLevels(mfw, IntVect{ng_vel}, IntVect(0,0,0), + time, cmf, ctime, fmf, ftime, + 0, 0, 1, geom[lev-1], geom[lev], + refRatio(lev-1), mapper, domain_bcs_type, + BCVars::zvel_bc); + } // !cons_only + } // lev > 0 + + // *************************************************************************** + // Physical bc's at domain boundary + // *************************************************************************** + IntVect ngvect_cons = IntVect(ng_cons,ng_cons,ng_cons); + IntVect ngvect_vels = IntVect(ng_vel ,ng_vel ,ng_vel); + + bool do_fb = true; + +#ifdef ERF_USE_NETCDF + // We call this here because it is an ERF routine + if (use_real_bcs && (lev==0)) { + fill_from_realbdy(mfs_vel,time,cons_only,icomp_cons,ncomp_cons,ngvect_cons, ngvect_vels); + do_fb = false; + } +#endif + + if (m_r2d) fill_from_bndryregs(mfs_vel,time); + + // We call this even if init_type == InitType::Real because this routine will fill the vertical bcs + (*physbcs_cons[lev])(*mfs_vel[Vars::cons],icomp_cons,ncomp_cons,ngvect_cons,time,BCVars::cons_bc, do_fb); + if (!cons_only) { + (*physbcs_u[lev])(*mfs_vel[Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc, do_fb); + (*physbcs_v[lev])(*mfs_vel[Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc, do_fb); + (*physbcs_w[lev])(*mfs_vel[Vars::zvel],*mfs_vel[Vars::xvel],*mfs_vel[Vars::yvel], + ngvect_vels,time,BCVars::zvel_bc, do_fb); + } + // *************************************************************************** + + // MOST boundary conditions + if (!(cons_only && ncomp_cons == 1) && m_most && allow_most_bcs) { + 
m_most->impose_most_bcs(lev,mfs_vel, + Tau11_lev[lev].get(), + Tau22_lev[lev].get(), + Tau33_lev[lev].get(), + Tau12_lev[lev].get(), Tau21_lev[lev].get(), + Tau13_lev[lev].get(), Tau31_lev[lev].get(), + Tau23_lev[lev].get(), Tau32_lev[lev].get(), + SFS_hfx1_lev[lev].get(), + SFS_hfx2_lev[lev].get(), + SFS_hfx3_lev[lev].get(), + SFS_q1fx1_lev[lev].get(), + SFS_q1fx2_lev[lev].get(), + SFS_q1fx3_lev[lev].get(), + z_phys_nd[lev].get()); + } + + // We always come in to this call with momenta so we need to leave with momenta! + // We need to make sure we convert back on all ghost cells/faces because this is + // how velocity from fine-fine copies (as well as physical and interpolated bcs) will be filled + if (!cons_only) + { + IntVect ngu = (!solverChoice.use_NumDiff) ? IntVect(1,1,1) : mfs_vel[Vars::xvel]->nGrowVect(); + IntVect ngv = (!solverChoice.use_NumDiff) ? IntVect(1,1,1) : mfs_vel[Vars::yvel]->nGrowVect(); + IntVect ngw = (!solverChoice.use_NumDiff) ? IntVect(1,1,0) : mfs_vel[Vars::zvel]->nGrowVect(); + + VelocityToMomentum(*mfs_vel[Vars::xvel], ngu, + *mfs_vel[Vars::yvel], ngv, + *mfs_vel[Vars::zvel], ngw, + *mfs_vel[Vars::cons], + *mfs_mom[IntVars::xmom], *mfs_mom[IntVars::ymom], *mfs_mom[IntVars::zmom], + Geom(lev).Domain(), + domain_bcs_type); + } + + mfs_mom[IntVars::cons]->FillBoundary(geom[lev].periodicity()); + mfs_mom[IntVars::xmom]->FillBoundary(geom[lev].periodicity()); + mfs_mom[IntVars::ymom]->FillBoundary(geom[lev].periodicity()); + mfs_mom[IntVars::zmom]->FillBoundary(geom[lev].periodicity()); +} diff --git a/Source/BoundaryConditions/ERF_FillPatch.cpp b/Source/BoundaryConditions/ERF_FillPatch.cpp index 4a98a01ba..03f272b55 100644 --- a/Source/BoundaryConditions/ERF_FillPatch.cpp +++ b/Source/BoundaryConditions/ERF_FillPatch.cpp @@ -7,8 +7,6 @@ using namespace amrex; -PhysBCFunctNoOp null_bc; - /* * Fill valid and ghost data with the "state data" at the given time * NOTE: THIS OPERATES ON VELOCITY (MOMENTA ARE JUST TEMPORARIES) @@ -27,6 +25,8 @@ ERF::FillPatch (int lev, Real time, BL_PROFILE_VAR("ERF::FillPatch()",ERF_FillPatch); Interpolater* mapper = nullptr; + PhysBCFunctNoOp null_bc; + // // *************************************************************************** // The first thing we do is interpolate the momenta on the "valid" faces of @@ -43,20 +43,9 @@ ERF::FillPatch (int lev, Real time, FPr_c[lev-1].FillSet(*mfs_vel[Vars::cons], time, null_bc, domain_bcs_type); } if (cf_set_width >= 0 && !cons_only) { - // - // This is an optimization since we won't need more than one ghost - // cell of momentum in the integrator if not using NumDiff - // - //IntVect ngu = (solverChoice.use_NumDiff) ? IntVect(1,1,1) : mfs_vel[Vars::xvel]->nGrowVect(); - //IntVect ngv = (solverChoice.use_NumDiff) ? IntVect(1,1,1) : mfs_vel[Vars::yvel]->nGrowVect(); - //IntVect ngw = (solverChoice.use_NumDiff) ? 
IntVect(1,1,0) : mfs_vel[Vars::zvel]->nGrowVect(); - IntVect ngu = IntVect::TheZeroVector(); - IntVect ngv = IntVect::TheZeroVector(); - IntVect ngw = IntVect::TheZeroVector(); - - VelocityToMomentum(*mfs_vel[Vars::xvel], ngu, - *mfs_vel[Vars::yvel], ngv, - *mfs_vel[Vars::zvel], ngw, + VelocityToMomentum(*mfs_vel[Vars::xvel], IntVect{0}, + *mfs_vel[Vars::yvel], IntVect{0}, + *mfs_vel[Vars::zvel], IntVect{0}, *mfs_vel[Vars::cons], *mfs_mom[IntVars::xmom], *mfs_mom[IntVars::ymom], @@ -95,24 +84,18 @@ ERF::FillPatch (int lev, Real time, FillPatchSingleLevel(*mfs_vel[Vars::cons], ngvect_cons, time, fmf, IntVect(0,0,0), ftime, 0, 0, ncomp, geom[lev]); - (*physbcs_cons[lev])(*mfs_vel[Vars::cons],0,ncomp,ngvect_cons,time,BCVars::cons_bc); - if (!cons_only) { fmf = {&vars_old[lev][Vars::xvel], &vars_new[lev][Vars::xvel]}; FillPatchSingleLevel(*mfs_vel[Vars::xvel], ngvect_vels, time, fmf, IntVect(0,0,0), ftime, 0, 0, 1, geom[lev]); - (*physbcs_u[lev])(*mfs_vel[Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc); fmf = {&vars_old[lev][Vars::yvel], &vars_new[lev][Vars::yvel]}; FillPatchSingleLevel(*mfs_vel[Vars::yvel], ngvect_vels, time, fmf, IntVect(0,0,0), ftime, 0, 0, 1, geom[lev]); - (*physbcs_v[lev])(*mfs_vel[Vars::yvel],0,1,ngvect_vels,time,BCVars::xvel_bc); fmf = {&vars_old[lev][Vars::zvel], &vars_new[lev][Vars::zvel]}; FillPatchSingleLevel(*mfs_vel[Vars::zvel], ngvect_vels, time, fmf, IntVect(0,0,0), ftime, 0, 0, 1, geom[lev]); - (*physbcs_w[lev])(*mfs_vel[Vars::zvel],*mfs_vel[Vars::xvel],*mfs_vel[Vars::yvel], - ngvect_vels,time,BCVars::zvel_bc); } // !cons_only } else { @@ -126,12 +109,9 @@ ERF::FillPatch (int lev, Real time, mapper = &cell_cons_interp; // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_cons[lev-1])(vars_old[lev-1][Vars::cons],0,mf_c.nComp(),ngvect_cons,time,BCVars::cons_bc); - (*physbcs_cons[lev-1])(vars_new[lev-1][Vars::cons],0,mf_c.nComp(),ngvect_cons,time,BCVars::cons_bc); - - // Make sure internal ghost cells are filled as well - vars_old[lev-1][Vars::cons].FillBoundary(geom[lev-1].periodicity()); - vars_new[lev-1][Vars::cons].FillBoundary(geom[lev-1].periodicity()); + // Note that we call FillBoundary inside the physbcs call + // We should not need to call this on old data since that would have been filled before the timestep started + (*physbcs_cons[lev-1])(vars_new[lev-1][Vars::cons],0,mf_c.nComp(),ngvect_cons,time,BCVars::cons_bc,true); // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled FillPatchTwoLevels(mf_c, ngvect_cons, IntVect(0,0,0), @@ -140,9 +120,6 @@ ERF::FillPatch (int lev, Real time, refRatio(lev-1), mapper, domain_bcs_type, BCVars::cons_bc); - // Impose physical bc's on fine data - (*physbcs_cons[lev])(mf_c,0,mf_c.nComp(),ngvect_cons,time,BCVars::cons_bc); - if (!cons_only) { mapper = &face_cons_linear_interp; @@ -153,13 +130,12 @@ ERF::FillPatch (int lev, Real time, // ********************************************************************** - // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_u[lev-1])(vars_old[lev-1][Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc); - (*physbcs_u[lev-1])(vars_new[lev-1][Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc); + cmf = {&vars_old[lev-1][Vars::xvel], &vars_new[lev-1][Vars::xvel]}; - // Make sure internal ghost cells are filled as well - vars_old[lev-1][Vars::xvel].FillBoundary(geom[lev-1].periodicity()); - vars_new[lev-1][Vars::xvel].FillBoundary(geom[lev-1].periodicity()); + // Impose physical bc's on coarse data 
(note time and 0 are not used) + // Note that we call FillBoundary inside the physbcs call + // We should not need to call this on old data since that would have been filled before the timestep started + (*physbcs_u[lev-1])(vars_new[lev-1][Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc,true); fmf = {&vars_old[lev ][Vars::xvel], &vars_new[lev ][Vars::xvel]}; cmf = {&vars_old[lev-1][Vars::xvel], &vars_new[lev-1][Vars::xvel]}; @@ -171,18 +147,14 @@ ERF::FillPatch (int lev, Real time, refRatio(lev-1), mapper, domain_bcs_type, BCVars::xvel_bc); - // Impose physical bc's on fine data - (*physbcs_u[lev])(vars_new[lev][Vars::xvel],0,mf_u.nComp(),ngvect_vels,time,BCVars::xvel_bc); - // ********************************************************************** - // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_v[lev-1])(vars_old[lev-1][Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc); - (*physbcs_v[lev-1])(vars_new[lev-1][Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc); + cmf = {&vars_old[lev-1][Vars::yvel], &vars_new[lev-1][Vars::yvel]}; - // Make sure internal ghost cells are filled as well - vars_old[lev-1][Vars::yvel].FillBoundary(geom[lev-1].periodicity()); - vars_new[lev-1][Vars::yvel].FillBoundary(geom[lev-1].periodicity()); + // Impose physical bc's on coarse data (note time and 0 are not used) + // Note that we call FillBoundary inside the physbcs call + // We should not need to call this on old data since that would have been filled before the timestep started + (*physbcs_v[lev-1])(vars_new[lev-1][Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc,true); fmf = {&vars_old[lev ][Vars::yvel], &vars_new[lev ][Vars::yvel]}; cmf = {&vars_old[lev-1][Vars::yvel], &vars_new[lev-1][Vars::yvel]}; @@ -194,24 +166,17 @@ ERF::FillPatch (int lev, Real time, refRatio(lev-1), mapper, domain_bcs_type, BCVars::yvel_bc); - // Impose physical bc's on fine data - (*physbcs_v[lev])(vars_new[lev][Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc); - // ********************************************************************** + cmf = {&vars_old[lev-1][Vars::zvel], &vars_new[lev-1][Vars::zvel]}; + // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_w[lev-1])(vars_old[lev-1][Vars::zvel], - vars_old[lev-1][Vars::xvel], - vars_old[lev-1][Vars::yvel], - ngvect_vels,time,BCVars::zvel_bc); + // Note that we call FillBoundary inside the physbcs call + // We should not need to call this on old data since that would have been filled before the timestep started (*physbcs_w[lev-1])(vars_new[lev-1][Vars::zvel], vars_new[lev-1][Vars::xvel], vars_new[lev-1][Vars::yvel], - ngvect_vels,time,BCVars::zvel_bc); - - // Make sure internal ghost cells are filled as well - vars_old[lev-1][Vars::zvel].FillBoundary(geom[lev-1].periodicity()); - vars_new[lev-1][Vars::zvel].FillBoundary(geom[lev-1].periodicity()); + ngvect_vels,time,BCVars::zvel_bc,true); fmf = {&vars_old[lev ][Vars::zvel], &vars_new[lev ][Vars::zvel]}; cmf = {&vars_old[lev-1][Vars::zvel], &vars_new[lev-1][Vars::zvel]}; @@ -222,12 +187,6 @@ ERF::FillPatch (int lev, Real time, 0, 0, 1, geom[lev-1], geom[lev], refRatio(lev-1), mapper, domain_bcs_type, BCVars::zvel_bc); - - - // Impose physical bc's on fine data -- note the u and v have been filled above - (*physbcs_w[lev])(*mfs_vel[Vars::zvel],*mfs_vel[Vars::xvel],*mfs_vel[Vars::yvel], - ngvect_vels,time,BCVars::zvel_bc); - } // !cons_only } // lev > 0 @@ -237,557 +196,25 @@ ERF::FillPatch (int lev, Real time, int icomp_cons = 0; int ncomp_cons = 
mfs_vel[Vars::cons]->nComp(); -#ifdef ERF_USE_NETCDF - // We call this here because it is an ERF routine - if (use_real_bcs && (lev==0)) { - fill_from_realbdy(mfs_vel,time,cons_only,icomp_cons,ncomp_cons,ngvect_cons,ngvect_vels); - } -#endif - - if (m_r2d) fill_from_bndryregs(mfs_vel,time); - - // We call these even if init_type == real because these will fill the vertical bcs - (*physbcs_cons[lev])(*mfs_vel[Vars::cons],icomp_cons,ncomp_cons,ngvect_cons,time,BCVars::cons_bc); - if (!cons_only) { - (*physbcs_u[lev])(*mfs_vel[Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc); - (*physbcs_v[lev])(*mfs_vel[Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc); - (*physbcs_w[lev])(*mfs_vel[Vars::zvel],*mfs_vel[Vars::xvel],*mfs_vel[Vars::yvel], - ngvect_vels,time,BCVars::zvel_bc); - } -} - -/* - * Fill ghost cells of qmoist - * - * @param[in] lev level of refinement at which to fill the data - * @param[in] time time at which the data should be filled - * @param[out] mf MultiFab to be filled (qmoist[lev]) - */ -void -ERF::FillPatchMoistVars (int lev, MultiFab& mf) -{ - BL_PROFILE_VAR("ERF::FillPatchMoistVars()",ERF_FillPatchMoistVars); - // *************************************************************************** - // Physical bc's at domain boundary - // *************************************************************************** - int icomp_cons = 0; - int ncomp_cons = 1; // We only fill qv, the first component - - // Note that we are filling qv, stored in qmoist[lev], with the input data (if there is any), stored - // in RhoQ1_comp. - - if (!use_real_bcs) { - Real time = Real(0.0); - IntVect ngvect_cons = mf.nGrowVect(); - int bccomp_cons = BCVars::RhoQ1_bc_comp; - - (*physbcs_cons[lev])(mf,icomp_cons,ncomp_cons,ngvect_cons,time,bccomp_cons); - } - - mf.FillBoundary(geom[lev].periodicity()); -} - -/* - * Fill valid and ghost data - * This version fills mfs in valid regions with the values in "mfs" when it is passed in; - * it is used only to compute ghost values for intermediate stages of a time integrator. 
- * - * @param[in] lev level of refinement at which to fill the data - * @param[in] time time at which the data should be filled - * @param[out] mfs_vel Vector of MultiFabs to be filled containing, in order: cons, xvel, yvel, and zvel - * @param[out] mfs_mom Vector of MultiFabs to be filled containing, in order: cons, xmom, ymom, and zmom - * @param[in] ng_cons number of ghost cells to be filled for conserved (cell-centered) variables - * @param[in] ng_vel number of ghost cells to be filled for velocity components - * @param[in] cons_only if 1 then only fill conserved variables - * @param[in] icomp_cons starting component for conserved variables - * @param[in] ncomp_cons number of components for conserved variables - * @param[in] eddyDiffs diffusion coefficients for LES turbulence models - * @param[in] allow_most_bcs if true then use MOST bcs at the low boundary - */ -void -ERF::FillIntermediatePatch (int lev, Real time, - const Vector& mfs_vel, // This includes cc quantities and VELOCITIES - const Vector& mfs_mom, // This includes cc quantities and MOMENTA - int ng_cons, int ng_vel, bool cons_only, - int icomp_cons, int ncomp_cons, - bool allow_most_bcs) -{ - BL_PROFILE_VAR("FillIntermediatePatch()",FillIntermediatePatch); - int bccomp; - Interpolater* mapper; - - // - // *************************************************************************** - // The first thing we do is interpolate the momenta on the "valid" faces of - // the fine grids (where the interface is coarse/fine not fine/fine) -- this - // will not be over-written by interpolation below because the FillPatch - // operators see these as valid faces. But we must have these interpolated - // values in the fine data before we call FillPatchTwoLevels. - // - // Also -- note that we might be filling values by interpolation at physical boundaries - // here but that's ok because we will overwrite those values when we impose - // the physical bc's below - // *************************************************************************** - if (lev>0) { - if (cf_set_width > 0) { - // We note that mfs_vel[Vars::cons] and mfs_mom[Vars::cons] are in fact the same pointer - FPr_c[lev-1].FillSet(*mfs_vel[Vars::cons], time, null_bc, domain_bcs_type); - } - if ( !cons_only && (cf_set_width >= 0) ) { - FPr_u[lev-1].FillSet(*mfs_mom[IntVars::xmom], time, null_bc, domain_bcs_type); - FPr_v[lev-1].FillSet(*mfs_mom[IntVars::ymom], time, null_bc, domain_bcs_type); - FPr_w[lev-1].FillSet(*mfs_mom[IntVars::zmom], time, null_bc, domain_bcs_type); - } - } - - AMREX_ALWAYS_ASSERT(mfs_mom.size() == IntVars::NumTypes); - AMREX_ALWAYS_ASSERT(mfs_vel.size() == Vars::NumTypes); - - // Enforce no penetration for thin immersed body - if (xflux_imask[lev]) { - ApplyMask(*mfs_mom[IntVars::xmom], *xflux_imask[lev]); - } - if (yflux_imask[lev]) { - ApplyMask(*mfs_mom[IntVars::ymom], *yflux_imask[lev]); - } - if (zflux_imask[lev]) { - ApplyMask(*mfs_mom[IntVars::zmom], *zflux_imask[lev]); - } - - // We always come in to this call with updated momenta but we need to create updated velocity - // in order to impose the rest of the bc's - if (!cons_only) { - // This only fills VALID region of velocity - MomentumToVelocity(*mfs_vel[Vars::xvel], *mfs_vel[Vars::yvel], *mfs_vel[Vars::zvel], - *mfs_vel[Vars::cons], - *mfs_mom[IntVars::xmom], *mfs_mom[IntVars::ymom], *mfs_mom[IntVars::zmom], - Geom(lev).Domain(), domain_bcs_type); - } - - // We now start working on conserved quantities + VELOCITY - for (int var_idx = 0; var_idx < Vars::NumTypes; ++var_idx) - { - if (cons_only 
&& var_idx != Vars::cons) continue; - - MultiFab& mf = *mfs_vel[var_idx]; - - IntVect ngvect; - int icomp, ncomp; - if (var_idx == Vars::cons) - { - bccomp = icomp_cons; - mapper = &cell_cons_interp; - ngvect = IntVect(ng_cons,ng_cons,ng_cons); - icomp = icomp_cons; - ncomp = ncomp_cons; - } - else if (var_idx == IntVars::xmom) - { - bccomp = BCVars::xvel_bc; - mapper = &face_cons_linear_interp; - ngvect = IntVect(ng_vel,ng_vel,ng_vel); - icomp = 0; - ncomp = 1; - } - else if (var_idx == IntVars::ymom) - { - bccomp = BCVars::yvel_bc; - mapper = &face_cons_linear_interp; - ngvect = IntVect(ng_vel,ng_vel,ng_vel); - icomp = 0; - ncomp = 1; - } - else if (var_idx == IntVars::zmom) - { - bccomp = BCVars::zvel_bc; - mapper = &face_cons_linear_interp; - ngvect = IntVect(ng_vel,ng_vel,ng_vel); - icomp = 0; - ncomp = 1; - } - - if (lev == 0) - { - // This fills fine-fine ghost values of cons and VELOCITY (not momentum) - mf.FillBoundary(icomp,ncomp,ngvect,geom[lev].periodicity()); - } - else - { - // - // NOTE: All interpolation here happens on velocities not momenta; - // note we only do the interpolation and FillBoundary here, - // physical bc's are imposed later - // - // NOTE: This will only fill velocity from coarse grid *outside* the fine grids - // unlike the FillSet calls above which filled momenta on the coarse/fine bdy - // - Vector fmf = {&mf,&mf}; - Vector cmf = {&vars_old[lev-1][var_idx], &vars_new[lev-1][var_idx]}; - Vector ctime = {t_old[lev-1], t_new[lev-1]}; - Vector ftime = {time,time}; - - if (var_idx == Vars::cons) { - // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_cons[lev-1])(vars_old[lev-1][Vars::cons],0,ncomp,ngvect,time,BCVars::cons_bc); - (*physbcs_cons[lev-1])(vars_new[lev-1][Vars::cons],0,ncomp,ngvect,time,BCVars::cons_bc); - - // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled - FillPatchTwoLevels(mf, ngvect, IntVect(0,0,0), - time, cmf, ctime, fmf, ftime, - 0, 0, ncomp, geom[lev-1], geom[lev], - refRatio(lev-1), mapper, domain_bcs_type, - bccomp); - - // Impose physical bc's on fine data - (*physbcs_cons[lev])(mf,0,ncomp,ngvect,time,BCVars::cons_bc); - - } else if (var_idx == Vars::xvel) { - - // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_u[lev-1])(vars_old[lev-1][Vars::xvel],0,1,ngvect,time,BCVars::xvel_bc); - (*physbcs_u[lev-1])(vars_new[lev-1][Vars::xvel],0,1,ngvect,time,BCVars::xvel_bc); - - // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled - FillPatchTwoLevels(mf, ngvect, IntVect(0,0,0), - time, cmf, ctime, fmf, ftime, - 0, 0, ncomp, geom[lev-1], geom[lev], - refRatio(lev-1), mapper, domain_bcs_type, - bccomp); - - // Impose physical bc's on fine data - (*physbcs_u[lev])(mf,0,1,ngvect,time,BCVars::xvel_bc); - - } else if (var_idx == Vars::yvel) { - - // Impose physical bc's on coarse data (note time and 0 are not used) - (*physbcs_v[lev-1])(vars_old[lev-1][Vars::yvel],0,1,ngvect,time,BCVars::yvel_bc); - (*physbcs_v[lev-1])(vars_new[lev-1][Vars::yvel],0,1,ngvect,time,BCVars::yvel_bc); - - // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled - FillPatchTwoLevels(mf, ngvect, IntVect(0,0,0), - time, cmf, ctime, fmf, ftime, - 0, 0, 1, geom[lev-1], geom[lev], - refRatio(lev-1), mapper, domain_bcs_type, - bccomp); - - // Impose physical bc's on fine data - (*physbcs_v[lev])(mf,0,1,ngvect,time,BCVars::yvel_bc); - - } else if (var_idx == Vars::zvel) { - - // Impose physical bc's on coarse data 
(note time and 0 are not used) - (*physbcs_w[lev-1])(vars_old[lev-1][Vars::zvel], - vars_old[lev-1][Vars::xvel], - vars_old[lev-1][Vars::yvel], - ngvect,time,BCVars::zvel_bc); - (*physbcs_w[lev-1])(vars_new[lev-1][Vars::zvel], - vars_new[lev-1][Vars::xvel], - vars_new[lev-1][Vars::yvel], - ngvect,time,BCVars::zvel_bc); - - // Call FillPatchTwoLevels which ASSUMES that all ghost cells have already been filled - FillPatchTwoLevels(mf, ngvect, IntVect(0,0,0), - time, cmf, ctime, fmf, ftime, - 0, 0, 1, geom[lev-1], geom[lev], - refRatio(lev-1), mapper, domain_bcs_type, - bccomp); - - // Impose physical bc's on fine data - (*physbcs_w[lev])(*mfs_vel[Vars::zvel],*mfs_vel[Vars::xvel],*mfs_vel[Vars::yvel], - ngvect,time,BCVars::zvel_bc); - } - } // lev > 0 - } // var_idx - - // *************************************************************************** - // Physical bc's at domain boundary - // *************************************************************************** - IntVect ngvect_cons = IntVect(ng_cons,ng_cons,ng_cons); - IntVect ngvect_vels = IntVect(ng_vel ,ng_vel ,ng_vel); + bool do_fb = true; #ifdef ERF_USE_NETCDF // We call this here because it is an ERF routine if (use_real_bcs && (lev==0)) { - fill_from_realbdy(mfs_vel,time,cons_only,icomp_cons,ncomp_cons,ngvect_cons, ngvect_vels); + fill_from_realbdy(mfs_vel,time,cons_only,icomp_cons,ncomp_cons,ngvect_cons,ngvect_vels); + do_fb = false; } #endif if (m_r2d) fill_from_bndryregs(mfs_vel,time); - // We call this even if init_type == real because this routine will fill the vertical bcs - (*physbcs_cons[lev])(*mfs_vel[Vars::cons],icomp_cons,ncomp_cons,ngvect_cons,time,BCVars::cons_bc); + // We call these even if init_type == InitType::Real because these will fill the vertical bcs + // Note that we call FillBoundary inside the physbcs call + (*physbcs_cons[lev])(*mfs_vel[Vars::cons],icomp_cons,ncomp_cons,ngvect_cons,time,BCVars::cons_bc, do_fb); if (!cons_only) { - (*physbcs_u[lev])(*mfs_vel[Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc); - (*physbcs_v[lev])(*mfs_vel[Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc); + (*physbcs_u[lev])(*mfs_vel[Vars::xvel],0,1,ngvect_vels,time,BCVars::xvel_bc, do_fb); + (*physbcs_v[lev])(*mfs_vel[Vars::yvel],0,1,ngvect_vels,time,BCVars::yvel_bc, do_fb); (*physbcs_w[lev])(*mfs_vel[Vars::zvel],*mfs_vel[Vars::xvel],*mfs_vel[Vars::yvel], - ngvect_vels,time,BCVars::zvel_bc); - } - // *************************************************************************** - - // MOST boundary conditions - if (!(cons_only && ncomp_cons == 1) && m_most && allow_most_bcs) { - m_most->impose_most_bcs(lev,mfs_vel, - Tau11_lev[lev].get(), - Tau22_lev[lev].get(), - Tau33_lev[lev].get(), - Tau12_lev[lev].get(), Tau21_lev[lev].get(), - Tau13_lev[lev].get(), Tau31_lev[lev].get(), - Tau23_lev[lev].get(), Tau32_lev[lev].get(), - SFS_hfx1_lev[lev].get(), - SFS_hfx2_lev[lev].get(), - SFS_hfx3_lev[lev].get(), - SFS_q1fx1_lev[lev].get(), - SFS_q1fx2_lev[lev].get(), - SFS_q1fx3_lev[lev].get(), - z_phys_nd[lev].get()); - } - - // We always come in to this call with momenta so we need to leave with momenta! 
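
The surrounding comments state the invariant this routine maintains: it is called with momenta, imposes boundary conditions on velocities, and must convert back on every ghost face before returning. A standalone toy sketch of that round trip (plain 1-D arrays rather than ERF's MultiFabs; the wall and outflow choices here are illustrative assumptions, not ERF defaults):

    // Toy illustration (not ERF code): momentum -> velocity -> impose bc's on
    // velocity in the ghost entries -> velocity -> momentum, so the caller
    // leaves with momenta that are consistent on ghost faces too.
    #include <array>
    #include <cstdio>

    int main () {
        // indices 0 and 5 are ghost entries; 1..4 are the valid region
        std::array<double,6> rho  = {1.0, 1.0, 1.1, 1.2, 1.2, 1.2};
        std::array<double,6> rhou = {0.0, 0.50, 0.55, 0.60, 0.60, 0.0};

        std::array<double,6> u{};
        for (int i = 0; i < 6; ++i) { u[i] = rhou[i] / rho[i]; } // momentum -> velocity

        u[0] = -u[1]; // low side: reflect, as for a no-slip wall (illustrative)
        u[5] =  u[4]; // high side: zero-gradient, as for outflow (illustrative)

        for (int i = 0; i < 6; ++i) { rhou[i] = u[i] * rho[i]; } // velocity -> momentum

        std::printf("ghost momenta after round trip: %g %g\n", rhou[0], rhou[5]);
        return 0;
    }

Skipping the conversion on ghost entries would leave stale momenta next to freshly imposed velocity values, which is exactly what the conversion back over grown boxes below avoids.
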
- // We need to make sure we convert back on all ghost cells/faces because this is - // how velocity from fine-fine copies (as well as physical and interpolated bcs) will be filled - if (!cons_only) { - IntVect ngu = mfs_vel[Vars::xvel]->nGrowVect(); - IntVect ngv = mfs_vel[Vars::yvel]->nGrowVect(); - IntVect ngw = mfs_vel[Vars::zvel]->nGrowVect(); - - if (!solverChoice.use_NumDiff) { - ngu = IntVect(1,1,1); - ngv = IntVect(1,1,1); - ngw = IntVect(1,1,1); - } - VelocityToMomentum(*mfs_vel[Vars::xvel], ngu, - *mfs_vel[Vars::yvel], ngv, - *mfs_vel[Vars::zvel], ngw, - *mfs_vel[Vars::cons], - *mfs_mom[IntVars::xmom], *mfs_mom[IntVars::ymom], *mfs_mom[IntVars::zmom], - Geom(lev).Domain(), - domain_bcs_type); - } - - mfs_mom[Vars::cons]->FillBoundary(geom[lev].periodicity()); -} - -/* - * Fill valid and ghost data. - * This version fills an entire MultiFab by interpolating from the coarser level -- this is used - * only when a new level of refinement is being created during a run (i.e not at initialization) - * This will never be used with static refinement. - * - * @param[in] lev level of refinement at which to fill the data - * @param[in] time time at which the data should be filled - * @param[out] mfs Vector of MultiFabs to be filled containing, in order: cons, xvel, yvel, and zvel data - */ -void -ERF::FillCoarsePatch (int lev, Real time) -{ - BL_PROFILE_VAR("FillCoarsePatch()",FillCoarsePatch); - AMREX_ASSERT(lev > 0); - - // - //**************************************************************************************************************** - // First fill velocities and density at the COARSE level so we can convert velocity to momenta at the COARSE level - //**************************************************************************************************************** - // - bool cons_only = false; - FillPatch(lev-1, time, {&vars_new[lev-1][Vars::cons], &vars_new[lev-1][Vars::xvel], - &vars_new[lev-1][Vars::yvel], &vars_new[lev-1][Vars::zvel]}, - {&vars_new[lev-1][Vars::cons], - &rU_new[lev-1], &rV_new[lev-1], &rW_new[lev-1]}, - false, cons_only); - - // - // ************************************************ - // Convert velocity to momentum at the COARSE level - // ************************************************ - // - IntVect ngu = IntVect(0,0,0); - IntVect ngv = IntVect(0,0,0); - IntVect ngw = IntVect(0,0,0); - - VelocityToMomentum(vars_new[lev-1][Vars::xvel], ngu, - vars_new[lev-1][Vars::yvel], ngv, - vars_new[lev-1][Vars::zvel], ngw, - vars_new[lev-1][Vars::cons], - rU_new[lev-1], - rV_new[lev-1], - rW_new[lev-1], - Geom(lev).Domain(), - domain_bcs_type); - // - // ***************************************************************** - // Interpolate all cell-centered variables from coarse to fine level - // ***************************************************************** - // - Interpolater* mapper_c = &cell_cons_interp; - Interpolater* mapper_f = &face_cons_linear_interp; - - // - //************************************************************************************************ - // Interpolate cell-centered data from coarse to fine level - // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled - // ************************************************************************************************ - IntVect ngvect_cons = vars_new[lev][Vars::cons].nGrowVect(); - int ncomp_cons = vars_new[lev][Vars::cons].nComp(); - - InterpFromCoarseLevel(vars_new[lev ][Vars::cons], ngvect_cons, IntVect(0,0,0), - vars_new[lev-1][Vars::cons], 0, 0, ncomp_cons, - 
geom[lev-1], geom[lev],
-                          refRatio(lev-1), mapper_c, domain_bcs_type, BCVars::cons_bc);
-
-    //
-    //************************************************************************************************
-    // Interpolate x-momentum from coarse to fine level
-    // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled
-    // ************************************************************************************************
-    //
-    InterpFromCoarseLevel(rU_new[lev], ngu, IntVect(0,0,0), rU_new[lev-1], 0, 0, 1,
-                          geom[lev-1], geom[lev],
-                          refRatio(lev-1), mapper_f, domain_bcs_type, BCVars::xvel_bc);
-
-    //
-    //************************************************************************************************
-    // Interpolate y-momentum from coarse to fine level
-    // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled
-    // ************************************************************************************************
-    //
-    InterpFromCoarseLevel(rV_new[lev], ngv, IntVect(0,0,0), rV_new[lev-1], 0, 0, 1,
-                          geom[lev-1], geom[lev],
-                          refRatio(lev-1), mapper_f, domain_bcs_type, BCVars::yvel_bc);
-
-    //************************************************************************************************
-    // Interpolate z-momentum from coarse to fine level
-    // with InterpFromCoarseLevel which ASSUMES that all ghost cells have already been filled
-    // ************************************************************************************************
-    InterpFromCoarseLevel(rW_new[lev], ngw, IntVect(0,0,0), rW_new[lev-1], 0, 0, 1,
-                          geom[lev-1], geom[lev],
-                          refRatio(lev-1), mapper_f, domain_bcs_type, BCVars::zvel_bc);
-    //
-    // *********************************************************
-    // After interpolation of momentum, convert back to velocity
-    // *********************************************************
-    //
-    for (int which_lev = lev-1; which_lev <= lev; which_lev++)
-    {
-        MomentumToVelocity(vars_new[which_lev][Vars::xvel],
-                           vars_new[which_lev][Vars::yvel],
-                           vars_new[which_lev][Vars::zvel],
-                           vars_new[which_lev][Vars::cons],
-                           rU_new[which_lev],
-                           rV_new[which_lev],
-                           rW_new[which_lev],
-                           Geom(lev).Domain(),
-                           domain_bcs_type);
+                      ngvect_vels,time,BCVars::zvel_bc, do_fb);
     }
-
-    vars_new[lev][Vars::cons].FillBoundary(geom[lev].periodicity());
-    vars_new[lev][Vars::xvel].FillBoundary(geom[lev].periodicity());
-    vars_new[lev][Vars::yvel].FillBoundary(geom[lev].periodicity());
-    vars_new[lev][Vars::zvel].FillBoundary(geom[lev].periodicity());
-
-    // ***************************************************************************
-    // Physical bc's at domain boundary
-    // ***************************************************************************
-    IntVect ngvect_vels = vars_new[lev][Vars::xvel].nGrowVect();
-
-    (*physbcs_cons[lev])(vars_new[lev][Vars::cons],0,ncomp_cons,ngvect_cons,time,BCVars::cons_bc);
-    (   *physbcs_u[lev])(vars_new[lev][Vars::xvel],0,1         ,ngvect_vels,time,BCVars::xvel_bc);
-    (   *physbcs_v[lev])(vars_new[lev][Vars::yvel],0,1         ,ngvect_vels,time,BCVars::yvel_bc);
-    (   *physbcs_w[lev])(vars_new[lev][Vars::zvel],vars_new[lev][Vars::xvel],vars_new[lev][Vars::yvel],
-                         ngvect_vels,time,BCVars::zvel_bc);
-
-    // ***************************************************************************
-    // Since lev > 0 here we don't worry about m_r2d or wrfbdy data
-    // ***************************************************************************
-}
-
-void
-ERF::FillBdyCCVels (Vector<MultiFab>& mf_cc_vel)
-{
-    // Impose bc's at domain boundaries
-    for (int lev = 0; lev <= finest_level; ++lev)
-    {
-        Box domain(Geom(lev).Domain());
-
-        int ihi = domain.bigEnd(0);
-        int jhi = domain.bigEnd(1);
-        int khi = domain.bigEnd(2);
-
-        // Impose periodicity first
-        mf_cc_vel[lev].FillBoundary(geom[lev].periodicity());
-
-        for (MFIter mfi(mf_cc_vel[lev], TilingIfNotGPU()); mfi.isValid(); ++mfi)
-        {
-            // Note that we don't fill corners here -- only the cells that share a face
-            // with interior cells -- this is all that is needed to calculate vorticity
-            const Box& bx = mfi.tilebox();
-            const Array4<Real>& vel_arr = mf_cc_vel[lev].array(mfi);
-
-            if (!Geom(lev).isPeriodic(0)) {
-                // Low-x side
-                if (bx.smallEnd(0) <= domain.smallEnd(0)) {
-                    Real mult = (phys_bc_type[0] == ERF_BC::no_slip_wall) ? -1. : 1.;
-                    ParallelFor(makeSlab(bx,0,0), [=] AMREX_GPU_DEVICE(int , int j, int k) noexcept
-                    {
-                        vel_arr(-1,j,k,1) = mult*vel_arr(0,j,k,1); // v
-                        vel_arr(-1,j,k,2) = mult*vel_arr(0,j,k,2); // w
-                    });
-                }
-
-                // High-x side
-                if (bx.bigEnd(0) >= domain.bigEnd(0)) {
-                    Real mult = (phys_bc_type[3] == ERF_BC::no_slip_wall) ? -1. : 1.;
-                    ParallelFor(makeSlab(bx,0,0), [=] AMREX_GPU_DEVICE(int , int j, int k) noexcept
-                    {
-                        vel_arr(ihi+1,j,k,1) = mult*vel_arr(ihi,j,k,1); // v
-                        vel_arr(ihi+1,j,k,2) = mult*vel_arr(ihi,j,k,2); // w
-                    });
-                }
-            } // !periodic
-
-            if (!Geom(lev).isPeriodic(1)) {
-                // Low-y side
-                if (bx.smallEnd(1) <= domain.smallEnd(1)) {
-                    Real mult = (phys_bc_type[1] == ERF_BC::no_slip_wall) ? -1. : 1.;
-                    ParallelFor(makeSlab(bx,1,0), [=] AMREX_GPU_DEVICE(int i, int , int k) noexcept
-                    {
-                        vel_arr(i,-1,k,0) = mult*vel_arr(i,0,k,0); // u
-                        vel_arr(i,-1,k,2) = mult*vel_arr(i,0,k,2); // w
-                    });
-                }
-
-                // High-y side
-                if (bx.bigEnd(1) >= domain.bigEnd(1)) {
-                    Real mult = (phys_bc_type[4] == ERF_BC::no_slip_wall) ? -1. : 1.;
-                    ParallelFor(makeSlab(bx,1,0), [=] AMREX_GPU_DEVICE(int i, int , int k) noexcept
-                    {
-                        vel_arr(i,jhi+1,k,0) = mult*vel_arr(i,jhi,k,0); // u
-                        vel_arr(i,jhi+1,k,2) = mult*vel_arr(i,jhi,k,2); // w
-                    });
-                }
-            } // !periodic
-
-            if (!Geom(lev).isPeriodic(2)) {
-                // Low-z side
-                if (bx.smallEnd(2) <= domain.smallEnd(2)) {
-                    Real mult = (phys_bc_type[2] == ERF_BC::no_slip_wall) ? -1. : 1.;
-                    ParallelFor(makeSlab(bx,2,0), [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-                    {
-                        vel_arr(i,j,-1,0) = mult*vel_arr(i,j,0,0); // u
-                        vel_arr(i,j,-1,1) = mult*vel_arr(i,j,0,1); // v
-                    });
-                }
-
-                // High-z side
-                if (bx.bigEnd(2) >= domain.bigEnd(2)) {
-                    Real mult = (phys_bc_type[5] == ERF_BC::no_slip_wall) ? -1.
: 1.; - ParallelFor(makeSlab(bx,2,0), [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - vel_arr(i,j,khi+1,0) = mult*vel_arr(i,j,khi,0); // u - vel_arr(i,j,khi+1,1) = mult*vel_arr(i,j,khi,1); // v - }); - } - } // !periodic - } // MFIter - - } // lev } diff --git a/Source/BoundaryConditions/ERF_PhysBCFunct.H b/Source/BoundaryConditions/ERF_PhysBCFunct.H index ea5ae9b00..d74a9834e 100644 --- a/Source/BoundaryConditions/ERF_PhysBCFunct.H +++ b/Source/BoundaryConditions/ERF_PhysBCFunct.H @@ -52,11 +52,12 @@ public: * @param[in] use_real_bcs if true then we fill boundary conditions for interior locations */ void operator() (amrex::MultiFab& mf, int icomp, int ncomp, - amrex::IntVect const& nghost, const amrex::Real time, int bccomp_cons); + amrex::IntVect const& nghost, const amrex::Real time, int bccomp_cons, + bool do_fb); void impose_lateral_cons_bcs (const amrex::Array4& dest_arr, const amrex::Box& bx, const amrex::Box& domain, - int icomp, int ncomp, int ngz); + int icomp, int ncomp, amrex::IntVect ng); void impose_vertical_cons_bcs (const amrex::Array4& dest_arr, const amrex::Box& bx, const amrex::Box& domain, const amrex::Array4& z_nd, @@ -106,7 +107,8 @@ public: * @param[in] use_real_bcs if true then we fill boundary conditions for interior locations */ void operator() (amrex::MultiFab& mf, int icomp, int ncomp, - amrex::IntVect const& nghost, const amrex::Real time, int bccomp); + amrex::IntVect const& nghost, const amrex::Real time, int bccomp, + bool do_fb); void impose_lateral_xvel_bcs (const amrex::Array4& dest_arr, const amrex::Box& bx, const amrex::Box& domain, @@ -164,7 +166,8 @@ public: * @param[in] use_real_bcs if true then we fill boundary conditions for interior locations */ void operator() (amrex::MultiFab& mf, int icomp, int ncomp, - amrex::IntVect const& nghost, const amrex::Real time, int bccomp); + amrex::IntVect const& nghost, const amrex::Real time, int bccomp, + bool do_fb); void impose_lateral_yvel_bcs (const amrex::Array4& dest_arr, const amrex::Box& bx, const amrex::Box& domain, @@ -222,7 +225,8 @@ public: * @param[in] use_real_bcs if true then we fill boundary conditions for interior locations */ void operator() (amrex::MultiFab& mf, amrex::MultiFab& xvel, amrex::MultiFab& yvel, - amrex::IntVect const& nghost, const amrex::Real time, int bccomp); + amrex::IntVect const& nghost, const amrex::Real time, int bccomp, + bool do_fb); void impose_lateral_zvel_bcs (const amrex::Array4& dest_arr, const amrex::Array4& xvel_arr, diff --git a/Source/BoundaryConditions/ERF_PhysBCFunct.cpp b/Source/BoundaryConditions/ERF_PhysBCFunct.cpp index 06becebfc..5ab47fb88 100644 --- a/Source/BoundaryConditions/ERF_PhysBCFunct.cpp +++ b/Source/BoundaryConditions/ERF_PhysBCFunct.cpp @@ -14,7 +14,8 @@ using namespace amrex; */ void ERFPhysBCFunct_cons::operator() (MultiFab& mf, int icomp, int ncomp, - IntVect const& nghost, const Real /*time*/, int /*bccomp*/) + IntVect const& nghost, const Real /*time*/, int /*bccomp*/, + bool do_fb) { BL_PROFILE("ERFPhysBCFunct_cons::()"); @@ -46,6 +47,14 @@ void ERFPhysBCFunct_cons::operator() (MultiFab& mf, int icomp, int ncomp, z_nd_mf_loc.nGrowVect()); } + // + // We fill all of the interior and periodic ghost cells first, so we can fill + // those directly inside the lateral and vertical calls. 
+ // + if (do_fb) { + mf.FillBoundary(m_geom.periodicity()); + } + #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif @@ -77,9 +86,11 @@ void ERFPhysBCFunct_cons::operator() (MultiFab& mf, int icomp, int ncomp, if (!m_use_real_bcs) { - impose_lateral_cons_bcs(cons_arr,cbx1,domain,icomp,ncomp,nghost[2]); + // We send a box with ghost cells in the lateral directions only + impose_lateral_cons_bcs(cons_arr,cbx1,domain,icomp,ncomp,nghost); } + // We send the full FAB box with ghost cells impose_vertical_cons_bcs(cons_arr,cbx2,domain,z_nd_arr,dxInv,icomp,ncomp); } @@ -88,7 +99,8 @@ void ERFPhysBCFunct_cons::operator() (MultiFab& mf, int icomp, int ncomp, } // operator() void ERFPhysBCFunct_u::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, - IntVect const& nghost, const Real time, int bccomp) + IntVect const& nghost, const Real time, int bccomp, + bool do_fb) { BL_PROFILE("ERFPhysBCFunct_u::()"); @@ -118,6 +130,14 @@ void ERFPhysBCFunct_u::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, z_nd_mf_loc.nGrowVect()); } + // + // We fill all of the interior and periodic ghost cells first, so we can fill + // those directly inside the lateral and vertical calls. + // + if (do_fb) { + mf.FillBoundary(m_geom.periodicity()); + } + #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif @@ -133,7 +153,10 @@ void ERFPhysBCFunct_u::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, // // These are the boxes we use to test on relative to the domain // - Box xbx1 = surroundingNodes(bx,0); xbx1.grow(IntVect(nghost[0],nghost[1],0)); + Box xbx1 = surroundingNodes(bx,0); xbx1.grow(nghost); + if(xbx1.smallEnd(2) < domain.smallEnd(2)) xbx1.setSmall(2,domain.smallEnd(2)); + if(xbx1.bigEnd(2) > domain.bigEnd(2)) xbx1.setBig(2,domain.bigEnd(2)); + Box xbx2 = surroundingNodes(bx,0); xbx2.grow(nghost); Array4 z_nd_arr; @@ -157,13 +180,13 @@ void ERFPhysBCFunct_u::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, impose_vertical_xvel_bcs(velx_arr,xbx2,domain,z_nd_arr,dxInv,bccomp,time); } - } // MFIter } // OpenMP } // operator() void ERFPhysBCFunct_v::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, - IntVect const& nghost, const Real /*time*/, int bccomp) + IntVect const& nghost, const Real /*time*/, int bccomp, + bool do_fb) { BL_PROFILE("ERFPhysBCFunct_v::()"); @@ -193,6 +216,14 @@ void ERFPhysBCFunct_v::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, z_nd_mf_loc.nGrowVect()); } + // + // We fill all of the interior and periodic ghost cells first, so we can fill + // those directly inside the lateral and vertical calls. 
+ // + if (do_fb) { + mf.FillBoundary(m_geom.periodicity()); + } + #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif @@ -208,7 +239,10 @@ void ERFPhysBCFunct_v::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, // // These are the boxes we use to test on relative to the domain // - Box ybx1 = surroundingNodes(bx,1); ybx1.grow(IntVect(nghost[0],nghost[1],0)); + Box ybx1 = surroundingNodes(bx,1); ybx1.grow(nghost); + if (ybx1.smallEnd(2) < domain.smallEnd(2)) ybx1.setSmall(2,domain.smallEnd(2)); + if (ybx1.bigEnd(2) > domain.bigEnd(2)) ybx1.setBig(2,domain.bigEnd(2)); + Box ybx2 = surroundingNodes(bx,1); ybx2.grow(nghost); Array4 z_nd_arr; @@ -236,7 +270,7 @@ void ERFPhysBCFunct_v::operator() (MultiFab& mf, int /*icomp*/, int /*ncomp*/, void ERFPhysBCFunct_w::operator() (MultiFab& mf, MultiFab& xvel, MultiFab& yvel, IntVect const& nghost, const Real /*time*/, - const int bccomp_w) + const int bccomp_w, bool do_fb) { BL_PROFILE("ERFPhysBCFunct_w::()"); @@ -254,7 +288,9 @@ void ERFPhysBCFunct_w::operator() (MultiFab& mf, MultiFab& xvel, MultiFab& yvel, gdomainz.grow(i, nghost[i]); } } + // // We want to make sure we impose the z-vels at k=0 if the box includes k=0 + // if (gdomainz.smallEnd(2) == 0) gdomainz.setSmall(2,1); Box ndomain = convert(domain,IntVect(1,1,1)); @@ -272,6 +308,14 @@ void ERFPhysBCFunct_w::operator() (MultiFab& mf, MultiFab& xvel, MultiFab& yvel, } z_nd_mf_loc.FillBoundary(m_geom.periodicity()); + // + // We fill all of the interior and periodic ghost cells first, so we can fill + // those directly inside the lateral and vertical calls. + // + if (do_fb) { + mf.FillBoundary(m_geom.periodicity()); + } + #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif @@ -287,8 +331,10 @@ void ERFPhysBCFunct_w::operator() (MultiFab& mf, MultiFab& xvel, MultiFab& yvel, // // These are the boxes we use to test on relative to the domain // - Box zbx = surroundingNodes(bx,2); zbx.grow(0,nghost[0]); - zbx.grow(1,nghost[1]); + Box zbx = surroundingNodes(bx,2); zbx.grow(nghost); + if (zbx.smallEnd(2) < domain.smallEnd(2)) zbx.setSmall(2,domain.smallEnd(2)); + if (zbx.bigEnd(2) > domain.bigEnd(2)) zbx.setBig(2,domain.bigEnd(2)+1); + Array4 z_nd_arr; if (m_z_phys_nd) @@ -296,19 +342,23 @@ void ERFPhysBCFunct_w::operator() (MultiFab& mf, MultiFab& xvel, MultiFab& yvel, z_nd_arr = z_nd_mf_loc.const_array(mfi); } - Array4 const& velx_arr = xvel.const_array(mfi); - Array4 const& vely_arr = yvel.const_array(mfi); - Array4< Real> const& velz_arr = mf.array(mfi); - - if (!m_use_real_bcs) + // + // Recall that gdomainz.smallEnd(2) = 1 not 0! 
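
Each of the four operator() overloads in this file now begins with the same guarded FillBoundary step before the lateral and vertical fills. In isolation the calling pattern looks like the minimal sketch below; apply_physbcs and the commented-out impose_* helpers are hypothetical names standing in for the ERFPhysBCFunct_* classes, not the real API:

    #include <AMReX_MultiFab.H>
    #include <AMReX_Geometry.H>

    // Sketch of the do_fb pattern: fill interior + periodic ghost cells once,
    // unless the caller has already filled them (e.g. from real bdy files).
    void apply_physbcs (amrex::MultiFab& mf, const amrex::Geometry& geom, bool do_fb)
    {
        if (do_fb) {
            // interior and periodic ghost cells first, so the physical-bc fills
            // below can safely read across fine-fine and periodic edges
            mf.FillBoundary(geom.periodicity());
        }
        // impose_lateral_bcs(mf, geom);   // hypothetical helper
        // impose_vertical_bcs(mf, geom);  // hypothetical helper
    }

Passing do_fb = false when the ghost cells were already filled (as FillIntermediatePatch does after fill_from_realbdy) skips one redundant parallel communication per variable.
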
+ // + if (!gdomainz.contains(zbx)) { - if (!gdomainz.contains(zbx)) + Array4 const& velx_arr = xvel.const_array(mfi); + Array4 const& vely_arr = yvel.const_array(mfi); + Array4< Real> const& velz_arr = mf.array(mfi); + + if (!m_use_real_bcs) { - impose_lateral_zvel_bcs(velz_arr,velx_arr,vely_arr,zbx,domain,z_nd_arr,dxInv,bccomp_w); + if (!gdomainz.contains(zbx)) + { + impose_lateral_zvel_bcs(velz_arr,velx_arr,vely_arr,zbx,domain,z_nd_arr,dxInv,bccomp_w); + } } - } // m_use_real_bcs - if (!gdomainz.contains(zbx)) { impose_vertical_zvel_bcs(velz_arr,velx_arr,vely_arr,zbx,domain,z_nd_arr,dxInv, bccomp_u, bccomp_v, bccomp_w, m_terrain_type); } diff --git a/Source/BoundaryConditions/Make.package b/Source/BoundaryConditions/Make.package index ca1133adb..2c02b46cb 100644 --- a/Source/BoundaryConditions/Make.package +++ b/Source/BoundaryConditions/Make.package @@ -10,6 +10,9 @@ CEXE_headers += ERF_ABLMost.H CEXE_sources += ERF_ABLMost.cpp CEXE_sources += ERF_FillPatch.cpp +CEXE_sources += ERF_FillCoarsePatch.cpp +CEXE_sources += ERF_FillIntermediatePatch.cpp +CEXE_sources += ERF_FillBdyCCVels.cpp CEXE_sources += ERF_FillPatcher.cpp CEXE_sources += ERF_PhysBCFunct.cpp diff --git a/Source/DataStructs/ERF_DataStruct.H b/Source/DataStructs/ERF_DataStruct.H index 9e7cc1470..645b5c7a5 100644 --- a/Source/DataStructs/ERF_DataStruct.H +++ b/Source/DataStructs/ERF_DataStruct.H @@ -42,7 +42,7 @@ AMREX_ENUM(MoistureType, ); AMREX_ENUM(WindFarmType, - Fitch, EWP, SimpleActuatorDisk, GeneralActuatorDisk, None + Fitch, EWP, SimpleAD, GeneralAD, None ); AMREX_ENUM(WindFarmLocType, @@ -50,7 +50,7 @@ AMREX_ENUM(WindFarmLocType, ); AMREX_ENUM(LandSurfaceType, - SLM, MM5, None + SLM, MM5, None, NOAH ); AMREX_ENUM(PerturbationType, @@ -339,7 +339,7 @@ struct SolverChoice { // Which type of multilevel coupling - coupling_type = CouplingType::TwoWay; // Default + coupling_type = CouplingType::OneWay; // Default pp.query_enum_case_insensitive("coupling_type",coupling_type); // Which type of windfarm model @@ -353,7 +353,8 @@ struct SolverChoice { pp.query("windfarm_loc_table", windfarm_loc_table); pp.query("windfarm_spec_table", windfarm_spec_table); pp.query("windfarm_blade_table", windfarm_blade_table); - pp.query("windfarm_airofil_tables", windfarm_airfoil_tables); + pp.query("windfarm_airfoil_tables", windfarm_airfoil_tables); + pp.query("windfarm_spec_table_extra", windfarm_spec_table_extra); // Sampling distance upstream of the turbine to find the // incoming free stream velocity as a factor of the diameter of the @@ -392,14 +393,14 @@ struct SolverChoice { // // Wind farm checks // - if (windfarm_type==WindFarmType::SimpleActuatorDisk and sampling_distance_by_D < 0.0) { + if (windfarm_type==WindFarmType::SimpleAD and sampling_distance_by_D < 0.0) { amrex::Abort("To use simplified actuator disks, you need to provide a variable" " erf.sampling_distance_by_D in the inputs which specifies the upstream" " distance as a factor of the turbine diameter at which the incoming free stream" " velocity will be computed at."); } - if ( (windfarm_type==WindFarmType::SimpleActuatorDisk || - windfarm_type==WindFarmType::GeneralActuatorDisk ) && turb_disk_angle < 0.0) { + if ( (windfarm_type==WindFarmType::SimpleAD || + windfarm_type==WindFarmType::GeneralAD ) && turb_disk_angle < 0.0) { amrex::Abort("To use simplified actuator disks, you need to provide a variable" " erf.turb_disk_angle_from_x in the inputs which is the angle of the face of the" " turbine disk from the x-axis. 
A turbine facing an oncoming flow in the x-direction" @@ -430,35 +431,35 @@ struct SolverChoice { amrex::Print() << "use_coriolis : " << use_coriolis << std::endl; amrex::Print() << "use_gravity : " << use_gravity << std::endl; - if (coupling_type == CouplingType::TwoWay) { - amrex::Print() << "Using two-way coupling " << std::endl; - } else if (coupling_type == CouplingType::OneWay) { - amrex::Print() << "Using one-way coupling " << std::endl; - } - + amrex::Print() << "Terrain Type: " << std::endl; if (terrain_type == TerrainType::Static) { - amrex::Print() << "Using static terrain " << std::endl; + amrex::Print() << " Static" << std::endl; } else if (terrain_type == TerrainType::Moving) { - amrex::Print() << "Using moving terrain " << std::endl; + amrex::Print() << " Moving" << std::endl; } else { - amrex::Print() << "No terrain " << std::endl; + amrex::Print() << " None" << std::endl; } + amrex::Print() << "ABL Driver Type: " << std::endl; if (abl_driver_type == ABLDriverType::None) { - amrex::Print() << "ABL Driver Type: " << "None" << std::endl; - amrex::Print() << "No ABL driver selected " << std::endl; + amrex::Print() << " None" << std::endl; } else if (abl_driver_type == ABLDriverType::PressureGradient) { - amrex::Print() << "ABL Driver Type: " << "PressureGradient" << std::endl; - amrex::Print() << "Driving abl_pressure_grad: ("; - for (int i = 0; i < AMREX_SPACEDIM; ++i) - amrex::Print() << abl_pressure_grad[i] << " "; - amrex::Print() << ")" << std::endl; + amrex::Print() << " Pressure Gradient " + << amrex::RealVect(abl_pressure_grad[0],abl_pressure_grad[1],abl_pressure_grad[2]) + << std::endl; } else if (abl_driver_type == ABLDriverType::GeostrophicWind) { - amrex::Print() << "ABL Driver Type: " << "GeostrophicWind" << std::endl; - amrex::Print() << "Driving abl_geo_forcing: ("; - for (int i = 0; i < AMREX_SPACEDIM; ++i) - amrex::Print() << abl_geo_forcing[i] << " "; - amrex::Print() << ")" << std::endl; + amrex::Print() << " Geostrophic Wind " + << amrex::RealVect(abl_geo_forcing[0],abl_geo_forcing[1],abl_geo_forcing[2]) + << std::endl; + } + + if (max_level > 0) { + amrex::Print() << "Coupling Type: " << std::endl; + if (coupling_type == CouplingType::TwoWay) { + amrex::Print() << " Two-way" << std::endl; + } else if (coupling_type == CouplingType::OneWay) { + amrex::Print() << " One-way" << std::endl; + } } amrex::Print() << "Buoyancy_type : " << buoyancy_type << std::endl; @@ -647,7 +648,7 @@ struct SolverChoice { // if SAM, then it will be set to RhoQ4 int RhoQr_comp {-1}; - std::string windfarm_loc_table, windfarm_spec_table; + std::string windfarm_loc_table, windfarm_spec_table, windfarm_spec_table_extra; std::string windfarm_blade_table, windfarm_airfoil_tables; amrex::Real sampling_distance_by_D = -1.0; amrex::Real turb_disk_angle = -1.0; diff --git a/Source/Diffusion/ERF_ComputeStrain_N.cpp b/Source/Diffusion/ERF_ComputeStrain_N.cpp index 996851723..6fbafdd5c 100644 --- a/Source/Diffusion/ERF_ComputeStrain_N.cpp +++ b/Source/Diffusion/ERF_ComputeStrain_N.cpp @@ -208,8 +208,8 @@ ComputeStrain_N (Box bxcc, Box tbxxy, Box tbxxz, Box tbxyz, Box domain, //*********************************************************************************** // Cell centered strains ParallelFor(bxcc, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - tau11(i,j,k) = (u(i+1, j , k )/mf_u(i+1,j,0) - u(i, j, k)/mf_u(i,j,0))*dxInv[0]*mf_u(i,j,0)*mf_u(i,j,0); - tau22(i,j,k) = (v(i , j+1, k )/mf_v(i,j+1,0) - v(i, j, k)/mf_v(i,j,0))*dxInv[1]*mf_v(i,j,0)*mf_v(i,j,0); + tau11(i,j,k) = (u(i+1, j 
, k )/mf_u(i+1,j,0) - u(i, j, k)/mf_u(i,j,0))*dxInv[0]*mf_m(i,j,0)*mf_m(i,j,0); + tau22(i,j,k) = (v(i , j+1, k )/mf_v(i,j+1,0) - v(i, j, k)/mf_v(i,j,0))*dxInv[1]*mf_m(i,j,0)*mf_m(i,j,0); tau33(i,j,k) = (w(i , j , k+1) - w(i, j, k))*dxInv[2]; }); diff --git a/Source/Diffusion/ERF_ComputeStrain_T.cpp b/Source/Diffusion/ERF_ComputeStrain_T.cpp index 450d4317d..377a6a782 100644 --- a/Source/Diffusion/ERF_ComputeStrain_T.cpp +++ b/Source/Diffusion/ERF_ComputeStrain_T.cpp @@ -25,6 +25,7 @@ using namespace amrex; * @param[in] z_nd nodal array of physical z heights * @param[in] bc_ptr container with boundary condition types * @param[in] dxInv inverse cell size array + * @param[in] mf_m map factor at cell center * @param[in] mf_u map factor at x-face * @param[in] mf_v map factor at y-face */ @@ -38,7 +39,7 @@ ComputeStrain_T (Box bxcc, Box tbxxy, Box tbxxz, Box tbxyz, Box domain, const Array4& z_nd, const Array4& detJ, const BCRec* bc_ptr, const GpuArray& dxInv, - const Array4& /*mf_m*/, + const Array4& mf_m, const Array4& mf_u, const Array4& mf_v) { @@ -452,9 +453,9 @@ ComputeStrain_T (Box bxcc, Box tbxxy, Box tbxxz, Box tbxyz, Box domain, met_h_zeta = detJ(i,j,k); tau11(i,j,k) = ( (u(i+1, j, k)/mf_u(i+1,j,0) - u(i, j, k)/mf_u(i,j,0))*dxInv[0] - - (met_h_xi/met_h_zeta)*GradUz ) * mf_u(i,j,0)*mf_u(i,j,0); + - (met_h_xi/met_h_zeta)*GradUz ) * mf_m(i,j,0)*mf_m(i,j,0); tau22(i,j,k) = ( (v(i, j+1, k)/mf_v(i,j+1,0) - v(i, j, k)/mf_v(i,j,0))*dxInv[1] - - (met_h_eta/met_h_zeta)*GradVz ) * mf_v(i,j,0)*mf_v(i,j,0); + - (met_h_eta/met_h_zeta)*GradVz ) * mf_m(i,j,0)*mf_m(i,j,0); tau33(i,j,k) = (w(i, j, k+1) - w(i, j, k))*dxInv[2]/met_h_zeta; }); diff --git a/Source/ERF.H b/Source/ERF.H index 1d74e328a..9c07a779f 100644 --- a/Source/ERF.H +++ b/Source/ERF.H @@ -22,6 +22,10 @@ #include #include +#ifdef ERF_USE_FFT +#include +#endif + #ifdef AMREX_MEM_PROFILING #include #endif @@ -40,6 +44,7 @@ #include #include #include +#include #ifdef ERF_USE_PARTICLES #include "ERF_ParticleData.H" @@ -66,6 +71,13 @@ class MultiBlockContainer; #endif +/** + * Enum of possible initialization types +*/ +AMREX_ENUM(InitType, + None, Input_Sounding, Ideal, Real, Metgrid, Uniform +); + /** * Enum of possible coarse/fine interpolation options */ @@ -136,8 +148,6 @@ public: // Project the velocities to be divergence-free -- this is only relevant if anelastic == 1 void project_velocities (int lev, amrex::Real dt, amrex::Vector& vars, amrex::MultiFab& p); - void solve_with_heffte (int lev, amrex::MultiFab& rhs, amrex::MultiFab& soln, - amrex::Array& fluxes); // Project the velocities to be divergence-free with a thin body void project_velocities_tb (int lev, amrex::Real dt, amrex::Vector& vars, amrex::MultiFab& p); @@ -368,12 +378,10 @@ public: // Advance a block specified number of time steps void Evolve_MB (int MBstep, int max_block_step); - // Advance a block specified number of time steps - void Evolve_MB (MultiBlockContainer* mbc, int MBstep, int max_block_step); - - // get the current time values - amrex::Real get_t_old() {return t_old[0];} - amrex::Real get_t_new() {return t_new[0];} + // get the current time values and dt + amrex::Real get_t_old (int lev) { return t_old[lev]; } + amrex::Real get_t_new (int lev) { return t_new[lev]; } + amrex::Real get_dt (int lev) { return dt[lev]; } // Set parmparse prefix for MultiBlock void SetParmParsePrefix (std::string name) { pp_prefix = name; } @@ -383,15 +391,14 @@ public: // Public data copy for MB std::vector domain_p; - MultiBlockContainer *m_mbc = nullptr; amrex::Vector > 
vars_new; amrex::Vector > vars_old; // Velocity time averaged field amrex::Vector> vel_t_avg; amrex::Vector t_avg_cnt; - #endif + std::string pp_prefix {"erf"}; void fill_from_bndryregs (const amrex::Vector& mfs, @@ -496,9 +503,6 @@ private: const amrex::Vector& mfs_mom, bool fillset=true, bool cons_only=false); - // Compute a new MultiFab by copying from valid region and filling ghost cells - - void FillPatchMoistVars (int lev, amrex::MultiFab& mf); - // Compute new multifabs by copying data from valid region and filling ghost cells. // Unlike FillPatch, FillIntermediatePatch will use the supplied multifabs instead of fine level data. // This is to support filling boundary cells at an intermediate time between old/new times @@ -947,9 +951,13 @@ private: #endif +#ifdef ERF_USE_MULTIBLOCK + MultiBlockContainer *m_mbc = nullptr; +#endif + static int verbose; static int mg_verbose; - static bool use_heffte; + static bool use_fft; // Diagnostic output interval static int sum_interval; @@ -959,13 +967,12 @@ private: // Native or NetCDF static std::string plotfile_type; - // init_type: "ideal", "real", "input_sounding", "metgrid" or "" - static std::string init_type; + static InitType init_type; // sponge_type: "input_sponge" static std::string sponge_type; - // use_real_bcs: only true if 1) ( (init_type == real) or (init_type == metgrid) ) + // use_real_bcs: only true if 1) ( (init_type == InitType::Real) or (init_type == InitType::Metgrid) ) // AND 2) we want to use the bc's from the WRF bdy file static bool use_real_bcs; @@ -978,7 +985,7 @@ private: int real_set_width{0}; // Flag to trigger initialization from input_sounding like WRF's ideal.exe - // used with init_type == "input_sounding" + // used with init_type == InitType::Input_Sounding static bool init_sounding_ideal; // Options for vertical interpolation of met_em*.nc data. 
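
The string-valued init_type above is replaced by a proper enum: AMREX_ENUM generates the string-to-enum mapping, and ParmParse::query_enum_case_insensitive accepts any capitalization of the value names, which is what lets the hand-rolled validation in ParameterSanityChecks be deleted later in this diff. A minimal sketch of the pattern; the "erf" prefix and the enum values are taken from this diff, everything else is illustrative:

    #include <AMReX_ParmParse.H>
    #include <AMReX_Enum.H>

    // Generates to/from-string helpers for the enum (same values as in ERF.H)
    AMREX_ENUM(InitType, None, Input_Sounding, Ideal, Real, Metgrid, Uniform);

    void read_init_type ()
    {
        amrex::ParmParse pp("erf");
        InitType init_type = InitType::None;                     // default when not specified
        pp.query_enum_case_insensitive("init_type", init_type);  // "metgrid", "MetGrid", ... all parse
        bool use_real_bcs = (init_type == InitType::Real || init_type == InitType::Metgrid);
        pp.query("use_real_bcs", use_real_bcs);
    }

An unrecognized value now fails inside ParmParse itself rather than in a separately maintained if-chain, so the accepted spellings can never drift out of sync with the enum.
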
@@ -1272,6 +1279,11 @@ private:
         amrex::ParallelDescriptor::Barrier("ERF::setRecordSampleLineInfo");
     }

+    // Data sampler for line and plane output
+    int sampler_interval = -1;
+    amrex::Real sampler_per = -1.0;
+    std::unique_ptr<SampleData> data_sampler = nullptr;
+
     amrex::Vector<std::unique_ptr<std::fstream> > datalog;
     amrex::Vector<std::string> datalogname;

@@ -1313,6 +1325,10 @@ private:
     { return 4; }
 #endif

+#ifdef ERF_USE_FFT
+    std::unique_ptr> m_poisson;
+#endif
+
 public:
     void writeJobInfo (const std::string& dir) const;
     static void writeBuildInfo (std::ostream& os);
diff --git a/Source/ERF.cpp b/Source/ERF.cpp
index 0e2331b6f..1234c661a 100644
--- a/Source/ERF.cpp
+++ b/Source/ERF.cpp
@@ -14,14 +14,6 @@
 #include
 #include

-#ifdef ERF_USE_MULTIBLOCK
-#ifndef ERF_MB_EXTERN // enter only if multiblock does not involve an external class
-#include
-#else
-#include
-#endif
-#endif
-
 using namespace amrex;

 Real ERF::startCPUTime = 0.0;
@@ -40,7 +32,7 @@ int ERF::fixed_mri_dt_ratio = 0;
 // Dictate verbosity in screen output
 int ERF::verbose = 0;
 int ERF::mg_verbose = 0;
-bool ERF::use_heffte = false;
+bool ERF::use_fft = false;

 // Frequency of diagnostic output
 int ERF::sum_interval = -1;
@@ -51,10 +43,9 @@ int ERF::pert_interval = -1;
 // Native AMReX vs NetCDF
 std::string ERF::plotfile_type = "amrex";

-// init_type: "uniform", "ideal", "real", "input_sounding", "metgrid" or ""
-std::string ERF::init_type;
+InitType ERF::init_type;

-// use_real_bcs: only true if 1) ( (init_type == real) or (init_type == metgrid) )
+// use_real_bcs: only true if 1) ( (init_type == InitType::Real) or (init_type == InitType::Metgrid) )
 //               AND 2) we want to use the bc's from the WRF bdy file
 bool ERF::use_real_bcs;

@@ -547,6 +538,12 @@ ERF::post_timestep (int nstep, Real time, Real dt_lev0)
         }
     }

+    // Write plane/line sampler data
+    if (is_it_time_for_action(nstep, time, dt_lev0, sampler_interval, sampler_per) && (data_sampler) ) {
+        data_sampler->get_sample_data(geom, vars_new);
+        data_sampler->write_sample_data(t_new, istep, ref_ratio, geom);
+    }
+
     // Moving terrain
     if ( solverChoice.use_terrain && (solverChoice.terrain_type == TerrainType::Moving) )
     {
@@ -568,17 +565,6 @@ ERF::InitData ()
 {
     BL_PROFILE_VAR("ERF::InitData()", InitData);
     InitData_pre();
-#if 0
-#ifdef ERF_USE_MULTIBLOCK
-#ifndef ERF_MB_EXTERN // enter only if multiblock does not involve an external class
-    // Multiblock: hook to set BL & comms once ba/dm are known
-    if(domain_p[0].bigEnd(0) < 500 ) {
-        m_mbc->SetBoxLists();
-        m_mbc->SetBlockCommMetaData();
-    }
-#endif
-#endif
-#endif
     InitData_post();
     BL_PROFILE_VAR_STOP(InitData);
 }
@@ -629,7 +615,7 @@ ERF::InitData_post ()
 {
     if (restart_chkfile.empty()) {
         if (solverChoice.use_terrain) {
-            if (init_type == "ideal") {
+            if (init_type == InitType::Ideal) {
                 Abort("We do not currently support init_type = ideal with terrain");
             }
         }
@@ -738,17 +724,6 @@ ERF::InitData_post ()
     for (int lev(0); lev <= max_level; ++lev) {
         make_physbcs(lev);
     }
-
-    // TODO: Check if this is needed. I don't think it is since we now
-    //       advect all the scalars...
-
-    // Need to fill ghost cells here since we will use this qmoist in advance
-    if (solverChoice.moisture_type != MoistureType::None)
-    {
-        for (int lev = 0; lev <= finest_level; lev++) {
-            if (qmoist[lev].size() > 0) FillPatchMoistVars(lev, *(qmoist[lev][0])); // qv component
-        }
-    }
 }

 #ifdef ERF_USE_PARTICLES
@@ -848,7 +823,7 @@ ERF::InitData_post ()
         solverChoice.rayleigh_damp_W ||solverChoice.rayleigh_damp_T)
     {
         initRayleigh();
-        if (init_type == "input_sounding")
+        if (init_type == InitType::Input_Sounding)
         {
             // Overwrite ubar, vbar, and thetabar with input profiles;
             // wbar is assumed to be 0. Note: the tau coefficient set by
@@ -1198,6 +1173,11 @@ ERF::InitData_post ()
     }

+    // Create object to do line and plane sampling if needed
+    bool do_line = false; bool do_plane = false;
+    pp.query("do_line_sampling",do_line); pp.query("do_plane_sampling",do_plane);
+    if (do_line || do_plane) { data_sampler = std::make_unique<SampleData>(do_line, do_plane); }
+
 #ifdef ERF_USE_EB
     bool write_eb_surface = false;
     pp.query("write_eb_surface", write_eb_surface);
@@ -1280,7 +1260,7 @@ ERF::init_only (int lev, Real time)
     // Map the words in the inputs file to BC types, then translate
     // those types into what they mean for each variable
     // This must be called before initHSE (where the base state is initialized)
-    if (lev == 0 && init_type != "ideal") {
+    if (lev == 0 && init_type != InitType::Ideal) {
         init_bcs();
     }
@@ -1297,7 +1277,7 @@ ERF::init_only (int lev, Real time)
     lev_new[Vars::zvel].setVal(0.0); lev_old[Vars::zvel].setVal(0.0);

     // Initialize background flow (optional)
-    if (init_type == "input_sounding") {
+    if (init_type == InitType::Input_Sounding) {
         // The base state is initialized by integrating vertically through the
         // input sounding, if the init_sounding_ideal flag is set; otherwise
         // it is set by initHSE()
@@ -1318,23 +1298,23 @@ ERF::init_only (int lev, Real time)
     }

 #ifdef ERF_USE_NETCDF
-    } else if (init_type == "ideal" || init_type == "real") {
+    } else if (init_type == InitType::Ideal || init_type == InitType::Real) {
         // The base state is initialized from WRF wrfinput data, output by
         // ideal.exe or real.exe
         init_from_wrfinput(lev);

         // The physbc's need the terrain but are needed for initHSE
-        if (init_type == "ideal") {
+        if (init_type == InitType::Ideal) {
             make_physbcs(lev);
             initHSE(lev);
         }

-    } else if (init_type == "metgrid") {
+    } else if (init_type == InitType::Metgrid) {
         // The base state is initialized from data output by WPS metgrid;
         // we will rebalance after interpolation
         init_from_metgrid(lev);
 #endif
-    } else if (init_type == "uniform") {
+    } else if (init_type == InitType::Uniform) {
         // Initialize a uniform background field and base state based on the
         // problem-specified reference density and temperature
@@ -1421,7 +1401,7 @@ ERF::ReadParameters ()
     // Verbosity
     pp.query("v", verbose);
     pp.query("mg_v", mg_verbose);
-    pp.query("use_heffte", use_heffte);
+    pp.query("use_fft", use_fft);

     // Frequency of diagnostic output
     pp.query("sum_interval", sum_interval);
@@ -1449,11 +1429,12 @@ ERF::ReadParameters ()
     pp.query("fixed_mri_dt_ratio", fixed_mri_dt_ratio);

     // How to initialize
-    pp.query("init_type",init_type);
+    init_type = InitType::None;
+    pp.query_enum_case_insensitive("init_type",init_type);

     // Should we use the bcs we've read in from wrfbdy or metgrid files?
     // We default to yes if we have them, but the user can override that option
-    use_real_bcs = ( (init_type == "real") || (init_type == "metgrid") );
+    use_real_bcs = ( (init_type == InitType::Real) || (init_type == InitType::Metgrid) );
     pp.query("use_real_bcs",use_real_bcs);

     // We use this to keep track of how many boxes we read in from WRF initialization
@@ -1540,6 +1521,10 @@ ERF::ReadParameters ()
     pp.query("column_loc_y", column_loc_y);
     pp.query("column_file_name", column_file_name);

+    // Sampler output frequency
+    pp.query("sampler_per", sampler_per);
+    pp.query("sampler_interval", sampler_interval);
+
     // Specify information about outputting planes of data
     pp.query("output_bndry_planes", output_bndry_planes);
     pp.query("bndry_output_planes_interval", bndry_output_planes_interval);
@@ -1575,7 +1560,7 @@ ERF::ReadParameters ()

     // No moving terrain with init real (we must do this after init_params
     // because that is where we set terrain_type
-    if (init_type == "real" && solverChoice.terrain_type == TerrainType::Moving) {
+    if (init_type == InitType::Real && solverChoice.terrain_type == TerrainType::Moving) {
         Abort("Moving terrain is not supported with init real");
     }

@@ -1587,6 +1572,11 @@ ERF::ReadParameters ()
     } else if (solverChoice.lsm_type == LandSurfaceType::MM5) {
         lsm.SetModel();
         Print() << "MM5 land surface model!\n";
+#ifdef ERF_USE_NOAH
+    } else if (solverChoice.lsm_type == LandSurfaceType::NOAH) {
+        lsm.SetModel();
+        Print() << "NOAH land surface model!\n";
+#endif
     } else if (solverChoice.lsm_type == LandSurfaceType::None) {
         lsm.SetModel();
         Print() << "Null land surface model!\n";
@@ -1607,8 +1597,8 @@ ERF::ParameterSanityChecks ()
 {
     AMREX_ALWAYS_ASSERT(cfl > 0. || fixed_dt[0] > 0.);

-    // We don't allow use_real_bcs to be true if init_type is not either real or metgrid
-    AMREX_ALWAYS_ASSERT(!use_real_bcs || ((init_type == "real") || (init_type == "metgrid")) );
+    // We don't allow use_real_bcs to be true if init_type is not either InitType::Real or InitType::Metgrid
+    AMREX_ALWAYS_ASSERT(!use_real_bcs || ((init_type == InitType::Real) || (init_type == InitType::Metgrid)) );

     AMREX_ALWAYS_ASSERT(real_width >= 0);
     AMREX_ALWAYS_ASSERT(real_set_width >= 0);
@@ -1635,17 +1625,6 @@ ERF::ParameterSanityChecks ()
         Abort("Dont know this plotfile_type");
     }

-    // Enforce the init_type is one we know
-    if (!init_type.empty() &&
-        init_type != "uniform" &&
-        init_type != "ideal" &&
-        init_type != "real" &&
-        init_type != "metgrid" &&
-        init_type != "input_sounding")
-    {
-        Error("if specified, init_type must be uniform, ideal, real, metgrid or input_sounding");
-    }
-
     // If fixed_mri_dt_ratio is set, it must be even
     if (fixed_mri_dt_ratio > 0 && (fixed_mri_dt_ratio%2 != 0) )
     {
@@ -1731,7 +1710,6 @@ ERF::MakeHorizontalAverages ()
         auto  fab_arr = mf.array(mfi);
         auto const hse_arr = base_state[lev].const_array(mfi);
         auto const cons_arr = vars_new[lev][Vars::cons].const_array(mfi);
-        auto const qv_arr = qmoist[lev][0]->const_array(mfi);
         int ncomp = vars_new[lev][Vars::cons].nComp();

         ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) {
@@ -1739,7 +1717,8 @@
             if (is_anelastic) {
                 fab_arr(i,j,k,2) = hse_arr(i,j,k,1);
             } else {
-                fab_arr(i, j, k, 2) = getPgivenRTh(cons_arr(i, j, k, RhoTheta_comp), qv_arr(i,j,k));
+                Real qv = cons_arr(i, j, k, RhoQ1_comp) / dens;
+                fab_arr(i, j, k, 2) = getPgivenRTh(cons_arr(i, j, k, RhoTheta_comp), qv);
             }
             fab_arr(i, j, k, 3) = (ncomp > RhoQ1_comp ?
cons_arr(i, j, k, RhoQ2_comp) / dens : 0.0); @@ -2025,86 +2004,6 @@ ERF::ERF (const RealBox& rb, int max_level_in, } #endif -#ifdef ERF_USE_MULTIBLOCK -// advance solution over specified block steps -void -ERF::Evolve_MB (int MBstep, int max_block_step) -{ - Real cur_time = t_new[0]; - - int step; - - // Take one coarse timestep by calling timeStep -- which recursively calls timeStep - // for finer levels (with or without subcycling) - for (int Bstep(0); Bstep < max_block_step && cur_time < stop_time; ++Bstep) - { - step = Bstep + MBstep - 1; - - Print() << "\nCoarse STEP " << step+1 << " starts ..." << std::endl; - - ComputeDt(step); - - // Make sure we have read enough of the boundary plane data to make it through this timestep - if (input_bndry_planes) - { - m_r2d->read_input_files(cur_time,dt[0],m_bc_extdir_vals); - } - - int lev = 0; - int iteration = 1; - timeStep(lev, cur_time, iteration); - -#ifndef ERF_MB_EXTERN - // DEBUG - // Multiblock: hook for erf2 to fill from erf1 - if(domain_p[0].bigEnd(0) < 500) { - for (int var_idx = 0; var_idx < Vars::NumTypes; ++var_idx) - m_mbc->FillPatchBlocks(var_idx,var_idx); - } -#endif - - cur_time += dt[0]; - - Print() << "Coarse STEP " << step+1 << " ends." << " TIME = " << cur_time - << " DT = " << dt[0] << std::endl; - - post_timestep(step, cur_time, dt[0]); - - if (writeNow(cur_time, dt[0], step+1, m_plot_int_1, m_plot_per_1)) { - last_plot_file_step_1 = step+1; - WritePlotFile(1,plot_var_names_1); - } - - if (writeNow(cur_time, dt[0], step+1, m_plot_int_2, m_plot_per_2)) { - last_plot_file_step_2 = step+1; - WritePlotFile(2,plot_var_names_2); - } - - if (writeNow(cur_time, dt[0], step+1, m_check_int, m_check_per)) { - last_check_file_step = step+1; -#ifdef ERF_USE_NETCDF - if (check_type == "netcdf") { - WriteNCCheckpointFile(); - } -#endif - if (check_type == "native") { - WriteCheckpointFile(); - } - } - -#ifdef AMREX_MEM_PROFILING - { - std::ostringstream ss; - ss << "[STEP " << step+1 << "]"; - MemProfiler::report(ss.str()); - } -#endif - - if (cur_time >= stop_time - 1.e-6*dt[0]) break; - } -} -#endif - bool ERF::writeNow(const Real cur_time, const Real dt_lev, const int nstep, const int plot_int, const Real plot_per) { diff --git a/Source/ERF_Tagging.cpp b/Source/ERF_Tagging.cpp index 5bfb5f08c..2203ac2bb 100644 --- a/Source/ERF_Tagging.cpp +++ b/Source/ERF_Tagging.cpp @@ -223,7 +223,7 @@ ERF::refinement_criteria_setup () boxes_at_level[lev_for_box].push_back(bx); Print() << "Saving in 'boxes at level' as " << bx << std::endl; } // lev - if (init_type == "real" || init_type == "metgrid") { + if (init_type == InitType::Real || init_type == InitType::Metgrid) { if (num_boxes_at_level[lev_for_box] != num_files_at_level[lev_for_box]) { amrex::Error("Number of boxes doesn't match number of input files"); @@ -261,7 +261,7 @@ ERF::refinement_criteria_setup () boxes_at_level[lev_for_box].push_back(bx); Print() << "Saving in 'boxes at level' as " << bx << std::endl; } // lev - if (init_type == "real" || init_type == "metgrid") { + if (init_type == InitType::Real || init_type == InitType::Metgrid) { if (num_boxes_at_level[lev_for_box] != num_files_at_level[lev_for_box]) { amrex::Error("Number of boxes doesn't match number of input files"); diff --git a/Source/ERF_make_new_arrays.cpp b/Source/ERF_make_new_arrays.cpp index ed9f652bd..78a249141 100644 --- a/Source/ERF_make_new_arrays.cpp +++ b/Source/ERF_make_new_arrays.cpp @@ -147,6 +147,9 @@ ERF::init_stuff (int lev, const BoxArray& ba, const DistributionMapping& dm, 
rW_new[lev].define(convert(ba, IntVect(0,0,1)), dm, 1, ngrow_vels); // We do this here just so they won't be undefined in the initial FillPatch + rU_old[lev].setVal(1.2e21); + rV_old[lev].setVal(3.4e22); + rW_old[lev].setVal(5.6e23); rU_new[lev].setVal(1.2e21); rV_new[lev].setVal(3.4e22); rW_new[lev].setVal(5.6e23); @@ -236,7 +239,7 @@ ERF::init_stuff (int lev, const BoxArray& ba, const DistributionMapping& dm, vars_windfarm[lev].define(ba, dm, 2, ngrow_state);// dudt, dvdt } if (solverChoice.windfarm_type == WindFarmType::GeneralAD) { - vars_windfarm[lev].define(ba, dm, 2, ngrow_state);// dudt, dvdt + vars_windfarm[lev].define(ba, dm, 3, ngrow_state);// dudt, dvdt, dwdt } Nturb[lev].define(ba, dm, 1, ngrow_state); // Number of turbines in a cell SMark[lev].define(ba, dm, 2, ngrow_state); // Free stream velocity/source term @@ -442,7 +445,7 @@ ERF::init_zphys (int lev, Real time) { if (solverChoice.use_terrain) { - if (init_type != "real" && init_type != "metgrid") + if (init_type != InitType::Real && init_type != InitType::Metgrid) { if (lev > 0) { // diff --git a/Source/ERF_make_new_level.cpp b/Source/ERF_make_new_level.cpp index 2de79276f..64e458e80 100644 --- a/Source/ERF_make_new_level.cpp +++ b/Source/ERF_make_new_level.cpp @@ -26,7 +26,7 @@ void ERF::MakeNewLevelFromScratch (int lev, Real time, const BoxArray& ba_in, BoxArray ba; DistributionMapping dm; Box domain(Geom(0).Domain()); - if (lev == 0 && + if (lev == 0 && restart_chkfile.empty() && (max_grid_size[0][0] >= domain.length(0)) && (max_grid_size[0][1] >= domain.length(1)) && ba_in.size() != ParallelDescriptor::NProcs()) @@ -47,7 +47,7 @@ void ERF::MakeNewLevelFromScratch (int lev, Real time, const BoxArray& ba_in, // Define dmap[lev] to be dm SetDistributionMap(lev, dm); - // amrex::Print() <<" BA FROM SCRATCH AT LEVEL " << lev << " " << ba << std::endl; + amrex::Print() <<" BA FROM SCRATCH AT LEVEL " << lev << " " << ba << std::endl; if (lev == 0) init_bcs(); @@ -113,14 +113,14 @@ void ERF::MakeNewLevelFromScratch (int lev, Real time, const BoxArray& ba_in, // ******************************************************************************************** // Initialize the data itself - // If (init_type == "real") then we are initializing terrain and the initial data in - // the same call so we must call init_only before update_terrain_arrays - // If (init_type != "real") then we want to initialize the terrain before the initial data - // since we may need to use the grid information before constructing - // initial idealized data + // If (init_type == InitType::Real) then we are initializing terrain and the initial data in + // the same call so we must call init_only before update_terrain_arrays + // If (init_type != InitType::Real) then we want to initialize the terrain before the initial data + // since we may need to use the grid information before constructing + // initial idealized data // ******************************************************************************************** if (restart_chkfile.empty()) { - if ((init_type == "real") || (init_type == "metgrid")) { + if ((init_type == InitType::Real) || (init_type == InitType::Metgrid)) { init_only(lev, start_time); init_zphys(lev, time); update_terrain_arrays(lev); @@ -128,7 +128,7 @@ void ERF::MakeNewLevelFromScratch (int lev, Real time, const BoxArray& ba_in, } else { init_zphys(lev, time); update_terrain_arrays(lev); - // Note that for init_type != real or metgrid, + // Note that for init_type != InitType::Real or InitType::Metgrid, // make_physbcs is called 
inside init_only init_only(lev, start_time); } @@ -283,7 +283,7 @@ ERF::MakeNewLevelFromCoarse (int lev, Real time, const BoxArray& ba, void ERF::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMapping& dm) { - // amrex::Print() <<" REMAKING WITH NEW BA AT LEVEL " << lev << " " << ba << std::endl; + amrex::Print() <<" REMAKING WITH NEW BA AT LEVEL " << lev << " " << ba << std::endl; AMREX_ALWAYS_ASSERT(lev > 0); AMREX_ALWAYS_ASSERT(solverChoice.terrain_type != TerrainType::Moving); @@ -332,9 +332,10 @@ ERF::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMapp // ***************************************************************************************************** make_physbcs(lev); - // ******************************************************************************************** + // ************************************************************************************************* // This will fill the temporary MultiFabs with data from vars_new - // ******************************************************************************************** + // NOTE: the momenta here are only used as scratch space, the momenta themselves are not fillpatched + // ************************************************************************************************* FillPatch(lev, time, {&temp_lev_new[Vars::cons],&temp_lev_new[Vars::xvel], &temp_lev_new[Vars::yvel],&temp_lev_new[Vars::zvel]}, {&temp_lev_new[Vars::cons],&rU_new[lev],&rV_new[lev],&rW_new[lev]}, diff --git a/Source/ERF_prob_common.H b/Source/ERF_prob_common.H index 070835c63..b9318abc2 100644 --- a/Source/ERF_prob_common.H +++ b/Source/ERF_prob_common.H @@ -468,7 +468,7 @@ protected: /** * Function to update default base parameters, currently only used for - * init_type=='uniform' + * init_type == InitType::Uniform */ void init_base_parms (amrex::Real rho_0, amrex::Real T_0) { base_parms.rho_0 = rho_0; diff --git a/Source/IO/ERF_Checkpoint.cpp b/Source/IO/ERF_Checkpoint.cpp index e67032d59..7a3452a51 100644 --- a/Source/IO/ERF_Checkpoint.cpp +++ b/Source/IO/ERF_Checkpoint.cpp @@ -221,7 +221,7 @@ ERF::WriteCheckpointFile () const #ifdef ERF_USE_NETCDF // Write bdy_data files - if (ParallelDescriptor::IOProcessor() && ((init_type=="real") || (init_type=="metgrid"))) { + if (ParallelDescriptor::IOProcessor() && ((init_type==InitType::Real) || (init_type==InitType::Metgrid))) { // Vector dimensions int num_time = bdy_data_xlo.size(); @@ -462,7 +462,7 @@ ERF::ReadCheckpointFile () #ifdef ERF_USE_NETCDF // Read bdy_data files - if ((init_type=="real") || (init_type=="metgrid")) { + if ((init_type==InitType::Real) || (init_type==InitType::Metgrid)) { int ioproc = ParallelDescriptor::IOProcessorNumber(); // I/O rank int num_time; int num_var; diff --git a/Source/IO/ERF_Plotfile.cpp b/Source/IO/ERF_Plotfile.cpp index 5bdcf5831..75557acd4 100644 --- a/Source/IO/ERF_Plotfile.cpp +++ b/Source/IO/ERF_Plotfile.cpp @@ -101,13 +101,9 @@ ERF::setPlotVariables (const std::string& pp_plot_var_names, Vector tmp_plot_names.push_back(derived_names[i]); } } - if(solverChoice.windfarm_type == WindFarmType::SimpleAD) { - if(derived_names[i] == "num_turb" or derived_names[i] == "SMark0" or derived_names[i] == "Smark1") { - tmp_plot_names.push_back(derived_names[i]); - } - } - if(solverChoice.windfarm_type == WindFarmType::GeneralAD) { - if(derived_names[i] == "num_turb" or derived_names[i] == "SMark1") { + if( solverChoice.windfarm_type == WindFarmType::SimpleAD or + solverChoice.windfarm_type == WindFarmType::GeneralAD ) { + 
if(derived_names[i] == "num_turb" or derived_names[i] == "SMark0" or derived_names[i] == "SMark1") { tmp_plot_names.push_back(derived_names[i]); } } @@ -199,6 +195,7 @@ ERF::WritePlotFile (int which, Vector plot_var_names) // We Fillpatch here because some of the derived quantities require derivatives // which require ghost cells to be filled. We do not need to call FillPatcher // because we don't need to set interior fine points. + // NOTE: the momenta here are only used as scratch space, the momenta themselves are not fillpatched for (int lev = 0; lev <= finest_level; ++lev) { bool fillset = false; FillPatch(lev, t_new[lev], {&vars_new[lev][Vars::cons], &vars_new[lev][Vars::xvel], @@ -485,8 +482,8 @@ ERF::WritePlotFile (int which, Vector plot_var_names) mf_comp ++; } - if(containerHasElement(plot_var_names, "SMark0") and - solverChoice.windfarm_type == WindFarmType::SimpleAD) { + if( containerHasElement(plot_var_names, "SMark0") and + (solverChoice.windfarm_type == WindFarmType::SimpleAD or solverChoice.windfarm_type == WindFarmType::GeneralAD) ) { for ( MFIter mfi(mf[lev],TilingIfNotGPU()); mfi.isValid(); ++mfi) { const Box& bx = mfi.tilebox(); @@ -1511,10 +1508,10 @@ ERF::WriteMultiLevelPlotfileWithTerrain (const std::string& plotfilename, int nl const Vector& mf, const Vector& mf_nd, const Vector& varnames, - const Vector& geom, + const Vector& my_geom, Real time, const Vector& level_steps, - const Vector& ref_ratio, + const Vector& rr, const std::string &versionName, const std::string &levelPrefix, const std::string &mfPrefix, @@ -1523,7 +1520,7 @@ ERF::WriteMultiLevelPlotfileWithTerrain (const std::string& plotfilename, int nl BL_PROFILE("WriteMultiLevelPlotfileWithTerrain()"); AMREX_ALWAYS_ASSERT(nlevels <= mf.size()); - AMREX_ALWAYS_ASSERT(nlevels <= ref_ratio.size()+1); + AMREX_ALWAYS_ASSERT(nlevels <= rr.size()+1); AMREX_ALWAYS_ASSERT(nlevels <= level_steps.size()); AMREX_ALWAYS_ASSERT(mf[0]->nComp() == varnames.size()); @@ -1553,7 +1550,7 @@ ERF::WriteMultiLevelPlotfileWithTerrain (const std::string& plotfilename, int nl std::ofstream::binary); if( ! 
HeaderFile.good()) FileOpenFailed(HeaderFileName); WriteGenericPlotfileHeaderWithTerrain(HeaderFile, nlevels, boxArrays, varnames, - geom, time, level_steps, ref_ratio, versionName, + my_geom, time, level_steps, rr, versionName, levelPrefix, mfPrefix); }; diff --git a/Source/IO/ERF_SampleData.H b/Source/IO/ERF_SampleData.H new file mode 100644 index 000000000..30ddd9454 --- /dev/null +++ b/Source/IO/ERF_SampleData.H @@ -0,0 +1,412 @@ +#ifndef ERF_SAMPLEDATA_H +#define ERF_SAMPLEDATA_H + +#include + +#include +#include +#include +#include + +#include + +struct LineSampler +{ + LineSampler () + { + amrex::ParmParse pp("erf"); + + // Count number of lo and hi points define the line + int n_line_lo = pp.countval("sample_line_lo") / AMREX_SPACEDIM; + int n_line_hi = pp.countval("sample_line_hi") / AMREX_SPACEDIM; + int n_line_dir = pp.countval("sample_line_dir"); + AMREX_ALWAYS_ASSERT( (n_line_lo==n_line_hi ) && + (n_line_lo==n_line_dir) ); + + // Parse the data + if (n_line_lo > 0) { + // Parse lo + amrex::Vector idx_lo; idx_lo.resize(n_line_lo*AMREX_SPACEDIM); + amrex::Vector iv_lo; iv_lo.resize(n_line_lo); + pp.queryarr("sample_line_lo",idx_lo,0,n_line_lo*AMREX_SPACEDIM); + for (int i(0); i < n_line_lo; i++) { + amrex::IntVect iv(idx_lo[AMREX_SPACEDIM*i+0], + idx_lo[AMREX_SPACEDIM*i+1], + idx_lo[AMREX_SPACEDIM*i+2]); + iv_lo[i] = iv; + } + + // Parse hi + amrex::Vector idx_hi; idx_hi.resize(n_line_hi*AMREX_SPACEDIM); + amrex::Vector iv_hi; iv_hi.resize(n_line_hi); + pp.queryarr("sample_line_hi",idx_hi,0,n_line_hi*AMREX_SPACEDIM); + for (int i(0); i < n_line_hi; i++) { + amrex::IntVect iv(idx_hi[AMREX_SPACEDIM*i+0], + idx_hi[AMREX_SPACEDIM*i+1], + idx_hi[AMREX_SPACEDIM*i+2]); + iv_hi[i] = iv; + } + + // Construct vector of bounding boxes + m_bnd_bx.resize(n_line_lo); + for (int i = 0; i < n_line_hi; i++){ + amrex::Box lbx(iv_lo[i],iv_hi[i]); + m_bnd_bx[i] = lbx; + } + + // Parse directionality + m_dir.resize(n_line_dir); + pp.queryarr("sample_line_dir",m_dir,0,n_line_dir); + + // Allocate space for level indicator + m_lev.resize(n_line_dir,0); + + // Allocate space for MF pointers + m_ls_mf.resize(n_line_lo); + } + } + + void + get_line_mfs (amrex::Vector>& vars_new) + { + int nlev = vars_new.size(); + int nline = m_bnd_bx.size(); + int ncomp = 2; + + // Loop over each line + for (int iline(0); iline=0; --ilev) { + + // Construct CC velocities + amrex::MultiFab mf_cc_vel; + auto ba = vars_new[ilev][Vars::cons].boxArray(); + auto dm = vars_new[ilev][Vars::cons].DistributionMap(); + mf_cc_vel.define(ba, dm, AMREX_SPACEDIM, amrex::IntVect(1,1,1)); + average_face_to_cellcenter(mf_cc_vel,0, + amrex::Array{&vars_new[ilev][Vars::xvel], + &vars_new[ilev][Vars::yvel], + &vars_new[ilev][Vars::zvel]}); + + // Construct vector of MFs holding T and WSP + amrex::MultiFab mf_cc_data; + mf_cc_data.define(ba, dm, ncomp, 1); +#ifdef _OPENMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_cc_data, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& tbx = mfi.tilebox(); + auto const& dfab = mf_cc_data.array(mfi); + auto const& tfab = vars_new[ilev][Vars::cons].array(mfi); + auto const& wfab = mf_cc_vel.array(mfi); + amrex::ParallelFor(tbx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept + { + dfab(i,j,k,0) = tfab(i,j,k,1)/tfab(i,j,k,0); + dfab(i,j,k,1) = std::sqrt(wfab(i,j,k,0)*wfab(i,j,k,0) + + wfab(i,j,k,1)*wfab(i,j,k,1) + + wfab(i,j,k,2)*wfab(i,j,k,2)) ; + }); + + } + + m_lev[iline] = ilev; + m_ls_mf[iline] = get_line_data(mf_cc_data, dir, 
cell, bnd_bx); + + // We can stop if we got the entire line + auto min_bnd_bx = m_ls_mf[iline].boxArray().minimalBox(); + if (bnd_bx == min_bnd_bx) { continue; } + + } // ilev + }// iline + } + + void + write_line_mfs (amrex::Vector& time, + amrex::Vector& level_steps, + amrex::Vector& ref_ratio, + amrex::Vector& geom) + { + std::string name_base = "plt_line_"; + amrex::Vector varnames = {"T", "Wsp"}; + + int nline = m_ls_mf.size(); + for (int iline(0); iline m_level_steps = {level_steps[lev]}; + amrex::Vector m_ref_ratio = {ref_ratio[lev]}; + + // Create modified geometry object corresponding to the line + auto plo = geom[lev].ProbLo(); + auto dx = geom[lev].CellSize(); + amrex::Vector m_geom; m_geom.resize(1); + amrex::Vector is_per(AMREX_SPACEDIM,0); + amrex::Box m_dom = m_bnd_bx[iline]; + amrex::RealBox m_rb; + for (int d(0); d mf = {&(m_ls_mf[iline])}; + + // Write each line + WriteMultiLevelPlotfile(plotfilename, 1, mf, + varnames, m_geom, m_time, + m_level_steps, m_ref_ratio); + } + } + + amrex::Vector m_dir; + amrex::Vector m_lev; + amrex::Vector m_bnd_bx; + amrex::Vector m_ls_mf; +}; + + +struct PlaneSampler +{ + PlaneSampler () + { + amrex::ParmParse pp("erf"); + + // Count number of lo and hi points define the plane + int n_plane_lo = pp.countval("sample_plane_lo") / AMREX_SPACEDIM; + int n_plane_hi = pp.countval("sample_plane_hi") / AMREX_SPACEDIM; + int n_plane_dir = pp.countval("sample_plane_dir"); + AMREX_ALWAYS_ASSERT( (n_plane_lo==n_plane_hi ) && + (n_plane_lo==n_plane_dir) ); + + // Parse the data + if (n_plane_lo > 0) { + // Parse lo + amrex::Vector r_lo; r_lo.resize(n_plane_lo*AMREX_SPACEDIM); + amrex::Vector> rv_lo; + pp.queryarr("sample_plane_lo",r_lo,0,n_plane_lo*AMREX_SPACEDIM); + for (int i(0); i < n_plane_lo; i++) { + amrex::Vector rv = {r_lo[AMREX_SPACEDIM*i+0], + r_lo[AMREX_SPACEDIM*i+1], + r_lo[AMREX_SPACEDIM*i+2]}; + rv_lo.push_back(rv); + } + + // Parse hi + amrex::Vector r_hi; r_hi.resize(n_plane_hi*AMREX_SPACEDIM); + amrex::Vector> rv_hi; + pp.queryarr("sample_plane_hi",r_hi,0,n_plane_hi*AMREX_SPACEDIM); + for (int i(0); i < n_plane_hi; i++) { + amrex::Vector rv = {r_hi[AMREX_SPACEDIM*i+0], + r_hi[AMREX_SPACEDIM*i+1], + r_hi[AMREX_SPACEDIM*i+2]}; + rv_hi.push_back(rv); + } + + // Construct vector of bounding real boxes + m_bnd_rbx.resize(n_plane_lo); + for (int i(0); i < n_plane_hi; i++){ + amrex::RealBox rbx(rv_lo[i].data(),rv_hi[i].data()); + m_bnd_rbx[i] = rbx; + } + + // Parse directionality + m_dir.resize(n_plane_dir); + pp.queryarr("sample_plane_dir",m_dir,0,n_plane_dir); + + // Allocate space for level indicator + m_lev.resize(n_plane_dir,0); + + // Allocate space for MF pointers + m_ps_mf.resize(n_plane_lo); + } + } + + // This must match what is in AMReX_MultiFabUtil.H + amrex::Box + getIndexBox (const amrex::RealBox& real_box, + const amrex::Geometry& geom) { + amrex::IntVect slice_lo, slice_hi; + + AMREX_D_TERM(slice_lo[0]=static_cast(std::floor((real_box.lo(0) - geom.ProbLo(0))/geom.CellSize(0)));, + slice_lo[1]=static_cast(std::floor((real_box.lo(1) - geom.ProbLo(1))/geom.CellSize(1)));, + slice_lo[2]=static_cast(std::floor((real_box.lo(2) - geom.ProbLo(2))/geom.CellSize(2)));); + + AMREX_D_TERM(slice_hi[0]=static_cast(std::floor((real_box.hi(0) - geom.ProbLo(0))/geom.CellSize(0)));, + slice_hi[1]=static_cast(std::floor((real_box.hi(1) - geom.ProbLo(1))/geom.CellSize(1)));, + slice_hi[2]=static_cast(std::floor((real_box.hi(2) - geom.ProbLo(2))/geom.CellSize(2)));); + + return amrex::Box(slice_lo, slice_hi) & geom.Domain(); + } + + void + 
get_plane_mfs (amrex::Vector& geom, + amrex::Vector>& vars_new) + { + int nlev = vars_new.size(); + int nplane = m_bnd_rbx.size(); + int ncomp = 2; + bool interpolate = true; + + // Loop over each plane + for (int iplane(0); iplane=0; --ilev) { + + // Construct CC velocities + amrex::MultiFab mf_cc_vel; + auto ba = vars_new[ilev][Vars::cons].boxArray(); + auto dm = vars_new[ilev][Vars::cons].DistributionMap(); + mf_cc_vel.define(ba, dm, AMREX_SPACEDIM, amrex::IntVect(1,1,1)); + average_face_to_cellcenter(mf_cc_vel,0, + amrex::Array{&vars_new[ilev][Vars::xvel], + &vars_new[ilev][Vars::yvel], + &vars_new[ilev][Vars::zvel]}); + + // Construct vector of MFs holding T and WSP + amrex::MultiFab mf_cc_data; + mf_cc_data.define(ba, dm, ncomp, 1); +#ifdef _OPENMP +#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) +#endif + for (amrex::MFIter mfi(mf_cc_data, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) { + const amrex::Box& tbx = mfi.tilebox(); + auto const& dfab = mf_cc_data.array(mfi); + auto const& tfab = vars_new[ilev][Vars::cons].array(mfi); + auto const& wfab = mf_cc_vel.array(mfi); + amrex::ParallelFor(tbx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept + { + dfab(i,j,k,0) = tfab(i,j,k,1)/tfab(i,j,k,0); + dfab(i,j,k,1) = std::sqrt(wfab(i,j,k,0)*wfab(i,j,k,0) + + wfab(i,j,k,1)*wfab(i,j,k,1) + + wfab(i,j,k,2)*wfab(i,j,k,2)) ; + }); + + } + + m_lev[iplane] = ilev; + m_ps_mf[iplane] = get_slice_data(dir, point, mf_cc_data, geom[ilev], + 0, ncomp, interpolate, bnd_rbx); + + // We can stop if we got the entire plane + auto min_bnd_bx = m_ps_mf[iplane]->boxArray().minimalBox(); + amrex::Box bnd_bx = getIndexBox(bnd_rbx, geom[ilev]); + if (bnd_bx == min_bnd_bx) { continue; } + + } // ilev + }// iplane + } + + void + write_plane_mfs (amrex::Vector& time, + amrex::Vector& level_steps, + amrex::Vector& ref_ratio, + amrex::Vector& geom) + { + std::string name_base = "plt_plane_"; + amrex::Vector varnames = {"T", "Wsp"}; + + int nplane = m_ps_mf.size(); + for (int iplane(0); iplane m_level_steps = {level_steps[lev]}; + amrex::Vector m_ref_ratio = {ref_ratio[lev]}; + + // Create modified geometry object corresponding to the plane + amrex::RealBox m_rb = m_bnd_rbx[iplane]; + amrex::Box m_dom = getIndexBox(m_rb, geom[lev]); + amrex::Real point = m_rb.hi(dir); + amrex::Vector is_per(AMREX_SPACEDIM,0); + for (int d(0); d m_geom; m_geom.resize(1); + m_geom[0].define(m_dom, &m_rb, geom[lev].Coord(), is_per.data()); + + // Create plotfile name + std::string name_plane = amrex::Concatenate(name_base, iplane , 5); + name_plane += "_step_"; + std::string plotfilename = amrex::Concatenate(name_plane, m_level_steps[0], 5); + + // Get the data + amrex::Vector mf = {m_ps_mf[iplane].get()}; + + // Write each plane + WriteMultiLevelPlotfile(plotfilename, 1, mf, + varnames, m_geom, m_time, + m_level_steps, m_ref_ratio); + } // iplane + } + + amrex::Vector m_dir; + amrex::Vector m_lev; + amrex::Vector m_bnd_rbx; + amrex::Vector> m_ps_mf; +}; + + +class SampleData +{ +public: + explicit SampleData (bool do_line=false, + bool do_plane=false) + { + if(do_line) m_ls = std::make_unique(); + if(do_plane) m_ps = std::make_unique(); + } + + void + get_sample_data (amrex::Vector& geom, + amrex::Vector>& vars_new) + { + if (m_ls) m_ls->get_line_mfs(vars_new); + if (m_ps) m_ps->get_plane_mfs(geom, vars_new); + } + + void + write_sample_data (amrex::Vector& time, + amrex::Vector& level_steps, + amrex::Vector& ref_ratio, + amrex::Vector& geom) + { + if (m_ls) m_ls->write_line_mfs(time, level_steps, ref_ratio, geom); + if 
(m_ls) m_ps->write_plane_mfs(time, level_steps, ref_ratio, geom); + } + +private: + + // Structures for getting line MFs + std::unique_ptr m_ls = nullptr; + + // Structures for getting plane MFs + std::unique_ptr m_ps = nullptr; +}; +#endif diff --git a/Source/IO/Make.package b/Source/IO/Make.package index 4f6f079c9..d2369a85f 100644 --- a/Source/IO/Make.package +++ b/Source/IO/Make.package @@ -14,6 +14,8 @@ CEXE_sources += ERF_WriteScalarProfiles.cpp CEXE_sources += ERF_console_io.cpp +CEXE_headers += ERF_SampleData.H + ifeq ($(USE_NETCDF), TRUE) CEXE_sources += ERF_ReadFromWRFBdy.cpp CEXE_sources += ERF_ReadFromWRFInput.cpp diff --git a/Source/Initialization/ERF_Metgrid_utils.H b/Source/Initialization/ERF_Metgrid_utils.H index 0be366bc5..96f20ab3e 100644 --- a/Source/Initialization/ERF_Metgrid_utils.H +++ b/Source/Initialization/ERF_Metgrid_utils.H @@ -108,79 +108,6 @@ init_base_state_from_metgrid (const bool use_moisture, const amrex::Vector& NC_psfc_fab, amrex::Vector>& fabs_for_bcs); -AMREX_FORCE_INLINE -AMREX_GPU_DEVICE -void -calc_rho_p (const int& kmax, - const bool metgrid_debug_psfc, - const int& flag_psfc, - const amrex::Real& psfc, - const amrex::Real& grav, - amrex::Real* Thetad_vec, - amrex::Real* Thetam_vec, - amrex::Real* Q_vec, - amrex::Real* z_vec, - amrex::Real* Rhom_vec, - amrex::Real* Pm_vec) -{ - const int maxiter = 10; - const amrex::Real tol = 1.0e-10; - - // Calculate or use moist pressure at the surface. - amrex::Real Psurf; - if (flag_psfc == 1) { - Psurf = psfc; - } else { - amrex::Real t_0 = 290.0; // WRF's model_config_rec%base_temp - amrex::Real a = 50.0; // WRF's model_config_rec%base_lapse - Psurf = p_0*exp(-t_0/a+std::pow((std::pow(t_0/a, 2)-2.0*grav*z_vec[0]/(a*R_d)), 0.5)); - } - if (metgrid_debug_psfc) Psurf = p_0; - - // Iterations for the first CC point that is 1/2 dz off the surface - { - amrex::Real half_dz = z_vec[0]; - amrex::Real qvf = 1.0+(R_v/R_d)*Q_vec[0]; - Thetam_vec[0] = Thetad_vec[0]*qvf; - Rhom_vec[0] = 1.0; // an initial guess. - for (int it=0; ittol) HSEutils::Newton_Raphson_hse(tol, R_d/Cp_d, dz, - grav, C, Thetad_vec[k], - Q_vec[k], Q_vec[k], - Pm_vec[k], Rhom_vec[k], F); - } // k -} - AMREX_FORCE_INLINE AMREX_GPU_DEVICE void @@ -437,7 +364,7 @@ interpolate_column_metgrid (const bool& metgrid_use_below_sfc, const int& i, const int& j, const int& src_comp, - const int& it, + const int& itime, char var_type, char stag, const amrex::Array4& orig_z_full, @@ -457,9 +384,9 @@ interpolate_column_metgrid (const bool& metgrid_use_below_sfc, int kmax_orig = amrex::ubound(amrex::Box(orig_data)).z; int kmax_new = amrex::ubound(amrex::Box(new_z_full)).z; - amrex::Array1D new_z, new_p, new_data; // length of kmax_new - amrex::Array1D orig_z, ordered_z, ordered_data; // length of kmax_orig - amrex::Array1D final_z, final_p, final_data; // length of kmax_orig + amrex::Array1D new_z, new_p, new_data; // max length of kmax_new is limited here + amrex::Array1D orig_z, ordered_z, ordered_data; // max length of kmax_orig is limited here + amrex::Array1D final_z, final_p, final_data; // max length of kmax_orig is limited here for (int k=0; k < kmax_new; k++) { if (stag == 'X') { new_z(k) = 0.25*(new_z_full(i,j,k)+new_z_full(i,j+1,k)+new_z_full(i,j,k+1)+new_z_full(i,j+1,k+1)); @@ -725,7 +652,7 @@ interpolate_column_metgrid (const bool& metgrid_use_below_sfc, // Save the interpolated data. 
for (int k=0; k < kmax_new; k++) { if ((mask(i,j,k)) && (update_bc_data)) bc_data_full(i,j,k,0) = new_data(k); - if (it == 0) new_data_full(i,j,k,src_comp) = new_data(k); + if (itime == 0) new_data_full(i,j,k,src_comp) = new_data(k); } } diff --git a/Source/Initialization/ERF_init_from_metgrid.cpp b/Source/Initialization/ERF_init_from_metgrid.cpp index 958b9429d..324c2da25 100644 --- a/Source/Initialization/ERF_init_from_metgrid.cpp +++ b/Source/Initialization/ERF_init_from_metgrid.cpp @@ -77,25 +77,35 @@ ERF::init_from_metgrid (int lev) Arena_Used = The_Pinned_Arena(); #endif - for (int it = 0; it < ntimes; it++) { #ifndef AMREX_USE_GPU - Print() << " init_from_metgrid: reading nc_init_file[" << lev << "][" << it << "]\t" << nc_init_file[lev][it] << std::endl; + if (metgrid_debug_quiescent) Print() << "metgrid_debug_quiescent = true" << std::endl; + if (metgrid_debug_isothermal) Print() << "metgrid_debug_isothermal = true" << std::endl; + if (metgrid_debug_dry) Print() << "metgrid_debug_dry = true" << std::endl; + if (metgrid_debug_psfc) Print() << "metgrid_debug_psfc = true" << std::endl; + if (metgrid_debug_msf) Print() << "metgrid_debug_msf = true" << std::endl; + if (metgrid_interp_theta) Print() << "metgrid_interp_theta = true" << std::endl; + if (metgrid_basic_linear) Print() << "metgrid_basic_linear = true" << std::endl; #endif - read_from_metgrid(lev, boxes_at_level[lev][0], nc_init_file[lev][it], - NC_dateTime[it], NC_epochTime[it], - flag_psfc[it], flag_msf[it], - flag_sst[it], flag_lmask[it], - NC_nx[it], NC_ny[it], NC_dx[it], NC_dy[it], - NC_xvel_fab[it], NC_yvel_fab[it], - NC_temp_fab[it], NC_rhum_fab[it], NC_pres_fab[it], - NC_ght_fab[it], NC_hgt_fab[it], NC_psfc_fab[it], - NC_MSFU_fab[it], NC_MSFV_fab[it], NC_MSFM_fab[it], - NC_sst_fab[it], NC_LAT_fab[it], NC_LON_fab[it], - NC_lmask_iab[it], Latitude, Longitude, geom[lev]); - } // it + + for (int itime(0); itime < ntimes; itime++) { +#ifndef AMREX_USE_GPU + Print() << " init_from_metgrid: reading nc_init_file[" << lev << "][" << itime << "]\t" << nc_init_file[lev][itime] << std::endl; +#endif + read_from_metgrid(lev, boxes_at_level[lev][0], nc_init_file[lev][itime], + NC_dateTime[itime], NC_epochTime[itime], + flag_psfc[itime], flag_msf[itime], + flag_sst[itime], flag_lmask[itime], + NC_nx[itime], NC_ny[itime], NC_dx[itime], NC_dy[itime], + NC_xvel_fab[itime], NC_yvel_fab[itime], + NC_temp_fab[itime], NC_rhum_fab[itime], NC_pres_fab[itime], + NC_ght_fab[itime], NC_hgt_fab[itime], NC_psfc_fab[itime], + NC_MSFU_fab[itime], NC_MSFV_fab[itime], NC_MSFM_fab[itime], + NC_sst_fab[itime], NC_LAT_fab[itime], NC_LON_fab[itime], + NC_lmask_iab[itime], Latitude, Longitude, geom[lev]); + } // itime // Verify that files in nc_init_file[lev] are ordered from earliest to latest. - for (int it = 1; it < ntimes; it++) AMREX_ALWAYS_ASSERT(NC_epochTime[it] > NC_epochTime[it-1]); + for (int itime(1); itime < ntimes; itime++) AMREX_ALWAYS_ASSERT(NC_epochTime[itime] > NC_epochTime[itime-1]); // Start at the earliest time in nc_init_file[lev]. start_bdy_time = NC_epochTime[0]; @@ -106,10 +116,10 @@ ERF::init_from_metgrid (int lev) bdy_time_interval = NC_epochTime[1]-NC_epochTime[0]; // Verify that met_em files have even spacing in time. 
- for (int it = 1; it < ntimes; it++) { - Real NC_dt = NC_epochTime[it]-NC_epochTime[it-1]; + for (int itime(1); itime < ntimes; itime++) { + Real NC_dt = NC_epochTime[itime]-NC_epochTime[itime-1]; #ifndef AMREX_USE_GPU - Print() << " " << nc_init_file[lev][it-1] << " / " << nc_init_file[lev][it] << " are " << NC_dt << " seconds apart" << std::endl; + Print() << " " << nc_init_file[lev][itime-1] << " / " << nc_init_file[lev][itime] << " are " << NC_dt << " seconds apart" << std::endl; #endif if (NC_dt != bdy_time_interval) Error("Time interval between consecutive met_em files must be consistent."); } @@ -143,12 +153,12 @@ ERF::init_from_metgrid (int lev) int i_lo = geom[lev].Domain().smallEnd(0); int i_hi = geom[lev].Domain().bigEnd(0); int j_lo = geom[lev].Domain().smallEnd(1); int j_hi = geom[lev].Domain().bigEnd(1); if (flag_sst[0]) { - for (int it = 0; it < ntimes; ++it) { - sst_lev[lev][it] = std::make_unique(ba2d,dm,1,ngv); - for ( MFIter mfi(*(sst_lev[lev][it]), TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + for (int itime(0); itime < ntimes; ++itime) { + sst_lev[lev][itime] = std::make_unique(ba2d,dm,1,ngv); + for ( MFIter mfi(*(sst_lev[lev][itime]), TilingIfNotGPU()); mfi.isValid(); ++mfi ) { Box gtbx = mfi.growntilebox(); - FArrayBox& dst = (*(sst_lev[lev][it]))[mfi]; - FArrayBox& src = NC_sst_fab[it]; + FArrayBox& dst = (*(sst_lev[lev][itime]))[mfi]; + FArrayBox& src = NC_sst_fab[itime]; const Array4< Real>& dst_arr = dst.array(); const Array4& src_arr = src.const_array(); ParallelFor(gtbx, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept @@ -158,19 +168,19 @@ ERF::init_from_metgrid (int lev) dst_arr(i,j,0) = src_arr(li,lj,0); }); } - sst_lev[lev][it]->FillBoundary(geom[lev].periodicity()); + sst_lev[lev][itime]->FillBoundary(geom[lev].periodicity()); } } else { - for (int it = 0; it < ntimes; ++it) sst_lev[lev][it] = nullptr; + for (int itime(0); itime < ntimes; ++itime) sst_lev[lev][itime] = nullptr; } if (flag_lmask[0]) { - for (int it = 0; it < ntimes; ++it) { - lmask_lev[lev][it] = std::make_unique(ba2d,dm,1,ngv); - for ( MFIter mfi(*(lmask_lev[lev][it]), TilingIfNotGPU()); mfi.isValid(); ++mfi ) { + for (int itime(0); itime < ntimes; ++itime) { + lmask_lev[lev][itime] = std::make_unique(ba2d,dm,1,ngv); + for ( MFIter mfi(*(lmask_lev[lev][itime]), TilingIfNotGPU()); mfi.isValid(); ++mfi ) { Box gtbx = mfi.growntilebox(); - IArrayBox& dst = (*(lmask_lev[lev][it]))[mfi]; - IArrayBox& src = NC_lmask_iab[it]; + IArrayBox& dst = (*(lmask_lev[lev][itime]))[mfi]; + IArrayBox& src = NC_lmask_iab[itime]; const Array4< int>& dst_arr = dst.array(); const Array4& src_arr = src.const_array(); ParallelFor(gtbx, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept @@ -180,7 +190,7 @@ ERF::init_from_metgrid (int lev) dst_arr(i,j,0) = src_arr(li,lj,0); }); } - lmask_lev[lev][it]->FillBoundary(geom[lev].periodicity()); + lmask_lev[lev][itime]->FillBoundary(geom[lev].periodicity()); } } @@ -214,15 +224,15 @@ ERF::init_from_metgrid (int lev) }); } - for (int it = 0; it < ntimes; it++) { + for (int itime(0); itime < ntimes; itime++) { // Verify that the grid size and resolution from met_em file matches that in geom (from ERF inputs file). - AMREX_ALWAYS_ASSERT(geom[lev].CellSizeArray()[0] == NC_dx[it]); - AMREX_ALWAYS_ASSERT(geom[lev].CellSizeArray()[1] == NC_dy[it]); + AMREX_ALWAYS_ASSERT(geom[lev].CellSizeArray()[0] == NC_dx[itime]); + AMREX_ALWAYS_ASSERT(geom[lev].CellSizeArray()[1] == NC_dy[itime]); // NC_nx-2 because NC_nx is the number of staggered grid points indexed from 1. 
- AMREX_ALWAYS_ASSERT(geom[lev].Domain().bigEnd(0) == NC_nx[it]-2); + AMREX_ALWAYS_ASSERT(geom[lev].Domain().bigEnd(0) == NC_nx[itime]-2); // NC_ny-2 because NC_ny is the number of staggered grid points indexed from 1. - AMREX_ALWAYS_ASSERT(geom[lev].Domain().bigEnd(1) == NC_ny[it]-2); - } // it + AMREX_ALWAYS_ASSERT(geom[lev].Domain().bigEnd(1) == NC_ny[itime]-2); + } // itime // This makes the Jacobian. make_J(geom[lev],*z_phys, *detJ_cc[lev]); @@ -240,13 +250,13 @@ ERF::init_from_metgrid (int lev) Print() << "ntimes = " << ntimes << std::endl; Vector> fabs_for_bcs; fabs_for_bcs.resize(ntimes); - for (int it(0); it < ntimes; it++) { - fabs_for_bcs[it].resize(MetGridBdyEnd); + for (int itime(0); itime < ntimes; itime++) { + fabs_for_bcs[itime].resize(MetGridBdyEnd); Box gdomain; Box ldomain; for (int nvar(0); nvar& fabs_for_bcs_arr = fabs_for_bcs[it][ivar].const_array(); + auto xlo_arr = bdy_data_xlo[itime][ivar].array(); + auto xhi_arr = bdy_data_xhi[itime][ivar].array(); + auto ylo_arr = bdy_data_ylo[itime][ivar].array(); + auto yhi_arr = bdy_data_yhi[itime][ivar].array(); + const Array4& fabs_for_bcs_arr = fabs_for_bcs[itime][ivar].const_array(); if (ivar == MetGridBdyVars::U) { xlo_plane = xlo_plane_x_stag; xhi_plane = xhi_plane_x_stag; @@ -520,7 +530,7 @@ ERF::init_from_metgrid (int lev) }); } // ivar - } // it + } // itime } /** @@ -535,12 +545,12 @@ init_terrain_from_metgrid (FArrayBox& z_phys_nd_fab, { int ntimes = 1; // Use terrain from the first met_em file. - for (int it = 0; it < ntimes; it++) { + for (int itime(0); itime < ntimes; itime++) { // This copies from NC_zphys on z-faces to z_phys_nd on nodes const Array4& z_arr = z_phys_nd_fab.array(); - const Array4& nc_hgt_arr = NC_hgt_fab[it].const_array(); + const Array4& nc_hgt_arr = NC_hgt_fab[itime].const_array(); - const Box z_hgt_box = NC_hgt_fab[it].box(); + const Box z_hgt_box = NC_hgt_fab[itime].box(); int ilo = z_hgt_box.smallEnd()[0]; int ihi = z_hgt_box.bigEnd()[0]; @@ -548,7 +558,7 @@ init_terrain_from_metgrid (FArrayBox& z_phys_nd_fab, int jhi = z_hgt_box.bigEnd()[1]; Box z_phys_box = z_phys_nd_fab.box(); - Box from_box = surroundingNodes(NC_hgt_fab[it].box()); + Box from_box = surroundingNodes(NC_hgt_fab[itime].box()); from_box.growHi(2,-1); Box bx = z_phys_box & from_box; @@ -562,7 +572,7 @@ init_terrain_from_metgrid (FArrayBox& z_phys_nd_fab, z_arr(i,j,k) = 0.25 * ( nc_hgt_arr (ii,jj ,k) + nc_hgt_arr(ii-1,jj ,k) + nc_hgt_arr (ii,jj-1,k) + nc_hgt_arr(ii-1,jj-1,k) ); }); - } // it + } // itime } /** @@ -594,10 +604,10 @@ init_terrain_from_metgrid (FArrayBox& z_phys_nd_fab, * @param NC_temp_fab Vector of FArrayBox objects holding metgrid data for temperature * @param NC_rhum_fab Vector of FArrayBox objects holding metgrid data for relative humidity * @param NC_pres_fab Vector of FArrayBox objects holding metgrid data for pressure - * @param p_interp_fab Vector of FArrayBox objects - * @param t_interp_fab Vector of FArrayBox objects - * @param theta_fab Vector of FArrayBox objects holding potential temperature calculated from temperature and pressure - * @param mxrat_fab Vector of FArrayBox objects holding vapor mixing ratio calculated from relative humidity + * @param p_interp_fab FArrayBox object + * @param t_interp_fab FArrayBox object + * @param theta_fab FArrayBox object holding potential temperature calculated from temperature and pressure + * @param mxrat_fab FArrayBox object holding vapor mixing ratio calculated from relative humidity * @param fabs_for_bcs Vector of Vector of FArrayBox objects holding 
MetGridBdyVars at each met_em time. * @param mask_c_arr * @param mask_u_arr @@ -645,7 +655,7 @@ init_state_from_metgrid (const bool use_moisture, // Loop over each time in the origin data. int ntimes = NC_hgt_fab.size(); - for (int it = 0; it < ntimes; it++) + for (int itime(0); itime < ntimes; itime++) { // ******************************************************** @@ -653,39 +663,38 @@ init_state_from_metgrid (const bool use_moisture, // ******************************************************** { #ifndef AMREX_USE_GPU - Print() << "[init_state_from_metgrid] vertical interpolation of u-velocity, it = " << it << std::endl; + Print() << "[init_state_from_metgrid] vertical interpolation of u-velocity, itime = " << itime << std::endl; #endif - Box bx2d = NC_xvel_fab[it].box() & tbxu; + Box bx2d = NC_xvel_fab[itime].box() & tbxu; bx2d.setRange(2,0); - auto const orig_data = NC_xvel_fab[it].const_array(); - auto const orig_z = NC_ght_fab[it].const_array(); + auto const orig_data = NC_xvel_fab[itime].const_array(); + auto const orig_z = NC_ght_fab[itime].const_array(); auto new_data = x_vel_fab.array(); - auto bc_data = fabs_for_bcs[it][MetGridBdyVars::U].array(); + auto bc_data = fabs_for_bcs[itime][MetGridBdyVars::U].array(); auto const new_z = z_phys_nd_fab.const_array(); int kmax = ubound(tbxu).z; ParallelFor(bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - if (metgrid_basic_linear) { - for (int k = 0; k<=kmax; k++) { + if (metgrid_debug_quiescent) { // Debugging option to run quiescent. + for (int k(0); k<=kmax; k++) { + if (mask_u_arr(i,j,k)) bc_data(i,j,k,0) = 0.0; + if (itime == 0) new_data(i,j,k,0) = 0.0; + } + } else if (metgrid_basic_linear) { // Linear interpolation with no quality control. + for (int k(0); k<=kmax; k++) { Real Interp_Val = interpolate_column_metgrid_linear(i,j,k,'X',0,orig_z,orig_data,new_z); if (mask_u_arr(i,j,k)) bc_data(i,j,k,0) = Interp_Val; - if (it==0) new_data(i,j,k,0) = Interp_Val; + if (itime == 0) new_data(i,j,k,0) = Interp_Val; } - } else { + } else { // Vertical interpolation and quality control similar to that from WRF. interpolate_column_metgrid(metgrid_use_below_sfc, metgrid_use_sfc, metgrid_exp_interp, metgrid_retain_sfc, metgrid_proximity, metgrid_order, - metgrid_force_sfc_k, i, j, 0, it, 'U', 'X', + metgrid_force_sfc_k, i, j, 0, itime, 'U', 'X', orig_z, orig_data, new_z, new_data, true, bc_data, mask_u_arr); } - if (metgrid_debug_quiescent) { // Debugging option to run quiescent. 
- for (int k = 0; k<=kmax; k++) { - if (mask_u_arr(i,j,k)) bc_data(i,j,k,0) = 0.0; - if (it==0) new_data(i,j,k,0) = 0.0; - } - } }); } @@ -695,39 +704,38 @@ init_state_from_metgrid (const bool use_moisture, // ******************************************************** { #ifndef AMREX_USE_GPU - Print() << "[init_state_from_metgrid] vertical interpolation of v-velocity, it = " << it << std::endl; + Print() << "[init_state_from_metgrid] vertical interpolation of v-velocity, itime = " << itime << std::endl; #endif - Box bx2d = NC_yvel_fab[it].box() & tbxv; + Box bx2d = NC_yvel_fab[itime].box() & tbxv; bx2d.setRange(2,0); - auto const orig_data = NC_yvel_fab[it].const_array(); - auto const orig_z = NC_ght_fab[it].const_array(); + auto const orig_data = NC_yvel_fab[itime].const_array(); + auto const orig_z = NC_ght_fab[itime].const_array(); auto new_data = y_vel_fab.array(); - auto bc_data = fabs_for_bcs[it][MetGridBdyVars::V].array(); + auto bc_data = fabs_for_bcs[itime][MetGridBdyVars::V].array(); auto const new_z = z_phys_nd_fab.const_array(); int kmax = ubound(tbxv).z; ParallelFor(bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - if (metgrid_basic_linear) { - for (int k = 0; k<=kmax; k++) { + if (metgrid_debug_quiescent) { // Debugging option to run quiescent. + for (int k(0); k<=kmax; k++) { + if (mask_v_arr(i,j,k)) bc_data(i,j,k,0) = 0.0; + if (itime == 0) new_data(i,j,k,0) = 0.0; + } + } else if (metgrid_basic_linear) { // Linear interpolation with no quality control. + for (int k(0); k<=kmax; k++) { Real Interp_Val = interpolate_column_metgrid_linear(i,j,k,'Y',0,orig_z,orig_data,new_z); if (mask_v_arr(i,j,k)) bc_data(i,j,k,0) = Interp_Val; - if (it==0) new_data(i,j,k,0) = Interp_Val; + if (itime == 0) new_data(i,j,k,0) = Interp_Val; } } else { interpolate_column_metgrid(metgrid_use_below_sfc, metgrid_use_sfc, metgrid_exp_interp, metgrid_retain_sfc, metgrid_proximity, metgrid_order, - metgrid_force_sfc_k, i, j, 0, it, 'V', 'Y', + metgrid_force_sfc_k, i, j, 0, itime, 'V', 'Y', orig_z, orig_data, new_z, new_data, true, bc_data, mask_v_arr); } - if (metgrid_debug_quiescent) { // Debugging option to run quiescent. - for (int k = 0; k<=kmax; k++) { - if (mask_v_arr(i,j,k)) bc_data(i,j,k,0) = 0.0; //60.0*it; - if (it==0) new_data(i,j,k,0) = 0.0; //60.0*it; - } - } }); } @@ -735,7 +743,7 @@ init_state_from_metgrid (const bool use_moisture, // ******************************************************** // W // ******************************************************** - if (it == 0) { // update at initialization + if (itime == 0) { // update at initialization z_vel_fab.template setVal(0.0); } @@ -743,7 +751,7 @@ init_state_from_metgrid (const bool use_moisture, // ******************************************************** // Initialize all state_fab variables to zero // ******************************************************** - if (it == 0) { // update at initialization + if (itime == 0) { // update at initialization state_fab.template setVal(0.0); } @@ -756,9 +764,9 @@ init_state_from_metgrid (const bool use_moisture, // then interpolate that onto the ERF vertical levels. { // calculate potential temperature. 
- Box bx = NC_rhum_fab[it].box() & tbxc; - auto const temp = NC_temp_fab[it].const_array(); - auto const pres = NC_pres_fab[it].const_array(); + Box bx = NC_rhum_fab[itime].box() & tbxc; + auto const temp = NC_temp_fab[itime].const_array(); + auto const pres = NC_pres_fab[itime].const_array(); auto theta = theta_fab.array(); ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept @@ -769,39 +777,38 @@ init_state_from_metgrid (const bool use_moisture, { // vertical interpolation of potential temperature. #ifndef AMREX_USE_GPU - Print() << "[init_state_from_metgrid] vertical interpolation of potential temperature, it = " << it << std::endl; + Print() << "[init_state_from_metgrid] vertical interpolation of potential temperature, itime = " << itime << std::endl; #endif - Box bx2d = NC_temp_fab[it].box() & tbxc; + Box bx2d = NC_temp_fab[itime].box() & tbxc; bx2d.setRange(2,0); auto const orig_data = theta_fab.const_array(); - auto const orig_z = NC_ght_fab[it].const_array(); + auto const orig_z = NC_ght_fab[itime].const_array(); auto new_data = state_fab.array(); - auto bc_data = fabs_for_bcs[it][MetGridBdyVars::T].array(); + auto bc_data = fabs_for_bcs[itime][MetGridBdyVars::T].array(); auto const new_z = z_phys_nd_fab.const_array(); int kmax = amrex::ubound(tbxc).z; ParallelFor(bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - if (metgrid_basic_linear) { - for (int k = 0; k<=kmax; k++) { + if (metgrid_debug_isothermal) { // Debugging option to run isothermal. + for (int k(0); k<=kmax; k++) { + if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = 300.0; + if (itime == 0) new_data(i,j,k,RhoTheta_comp) = 300.0; + } + } else if (metgrid_basic_linear) { // Linear interpolation with no quality control. + for (int k(0); k<=kmax; k++) { Real Interp_Val = interpolate_column_metgrid_linear(i,j,k,'M',0,orig_z,orig_data,new_z); if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = Interp_Val; - if (it==0) new_data(i,j,k,RhoTheta_comp) = Interp_Val; + if (itime == 0) new_data(i,j,k,RhoTheta_comp) = Interp_Val; } - } else { + } else { // Vertical interpolation and quality control similar to that from WRF. interpolate_column_metgrid(metgrid_use_below_sfc, metgrid_use_sfc, metgrid_exp_interp, metgrid_retain_sfc, metgrid_proximity, metgrid_order, - metgrid_force_sfc_k, i, j, RhoTheta_comp, it, 'T', 'M', + metgrid_force_sfc_k, i, j, RhoTheta_comp, itime, 'T', 'M', orig_z, orig_data, new_z, new_data, true, bc_data, mask_c_arr); } - if (metgrid_debug_isothermal) { // Debugging option to run isothermal. - for (int k = 0; k<=kmax; k++) { - if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = 300.0; - if (it==0) new_data(i,j,k,RhoTheta_comp) = 300.0; - } - } }); } @@ -809,12 +816,12 @@ init_state_from_metgrid (const bool use_moisture, { // vertical interpolation of pressure. 
#ifndef AMREX_USE_GPU - Print() << "[init_state_from_metgrid] vertical interpolation of pressure, it = " << it << std::endl; + Print() << "[init_state_from_metgrid] vertical interpolation of pressure, itime = " << itime << std::endl; #endif Box bx2d = p_interp_fab.box() & tbxc; bx2d.setRange(2,0); - auto const orig_data = NC_pres_fab[it].const_array(); - auto const orig_z = NC_ght_fab[it].const_array(); + auto const orig_data = NC_pres_fab[itime].const_array(); + auto const orig_z = NC_ght_fab[itime].const_array(); auto new_data = p_interp_fab.array(); auto const new_z = z_phys_nd_fab.const_array(); const amrex::Array4 bc_data_unused; @@ -823,12 +830,12 @@ init_state_from_metgrid (const bool use_moisture, ParallelFor(bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - if (metgrid_basic_linear) { - for (int k = 0; k<=kmax; k++) { + if (metgrid_basic_linear) { // Linear interpolation with no quality control. + for (int k(0); k<=kmax; k++) { Real Interp_Val = interpolate_column_metgrid_linear(i,j,k,'M',0,orig_z,orig_data,new_z); new_data(i,j,k) = Interp_Val; } - } else { + } else { // Vertical interpolation and quality control similar to that from WRF. // Interpolate pressure not w.r.t. z but rather p_0*exp(-CONST_GRAV*z/(t_0*R_d)). // This is akin to interpolating in pressure-space assuming a baroclinic atmosphere. interpolate_column_metgrid(metgrid_use_below_sfc, metgrid_use_sfc, true, @@ -842,12 +849,12 @@ init_state_from_metgrid (const bool use_moisture, { // vertical interpolation of temperature. #ifndef AMREX_USE_GPU - Print() << "[init_state_from_metgrid] vertical interpolation of temperature, it = " << it << std::endl; + Print() << "[init_state_from_metgrid] vertical interpolation of temperature, itime = " << itime << std::endl; #endif Box bx2d = p_interp_fab.box() & tbxc; bx2d.setRange(2,0); - auto const orig_data = NC_temp_fab[it].const_array(); - auto const orig_z = NC_ght_fab[it].const_array(); + auto const orig_data = NC_temp_fab[itime].const_array(); + auto const orig_z = NC_ght_fab[itime].const_array(); auto new_data = t_interp_fab.array(); auto const new_z = z_phys_nd_fab.const_array(); const amrex::Array4 bc_data_unused; @@ -856,12 +863,12 @@ init_state_from_metgrid (const bool use_moisture, ParallelFor(bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - if (metgrid_basic_linear) { - for (int k = 0; k<=kmax; k++) { + if (metgrid_basic_linear) { // Linear interpolation with no quality control. + for (int k(0); k<=kmax; k++) { Real Interp_Val = interpolate_column_metgrid_linear(i,j,k,'M',0,orig_z,orig_data,new_z); new_data(i,j,k) = Interp_Val; } - } else { + } else { // Vertical interpolation and quality control similar to that from WRF. // According to WRF's code comments, "It is better to // interpolate temperature and potential temperature // in LOG(p), regardless of requested default." @@ -878,14 +885,14 @@ init_state_from_metgrid (const bool use_moisture, auto const temp = t_interp_fab.const_array(); auto const pres = p_interp_fab.const_array(); auto new_data = state_fab.array(); - auto bc_data = fabs_for_bcs[it][MetGridBdyVars::T].array(); + auto bc_data = fabs_for_bcs[itime][MetGridBdyVars::T].array(); ParallelFor(tbxc, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { Real Calc_Val = getThgivenPandT(temp(i,j,k),pres(i,j,k),l_rdOcp); if (metgrid_debug_isothermal) Calc_Val = 300.0; // Debugging option to run isothermal. 
if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = Calc_Val; - if (it==0) new_data(i,j,k,RhoTheta_comp) = Calc_Val; + if (itime == 0) new_data(i,j,k,RhoTheta_comp) = Calc_Val; }); } @@ -901,10 +908,10 @@ init_state_from_metgrid (const bool use_moisture, // could be specific humidity or a mixing ratio. // { // calculate vapor mixing ratio from relative humidity. - Box bx = NC_temp_fab[it].box() & tbxc; - auto const rhum = NC_rhum_fab[it].const_array(); - auto const temp = NC_temp_fab[it].const_array(); - auto const pres = NC_pres_fab[it].const_array(); + Box bx = NC_temp_fab[itime].box() & tbxc; + auto const rhum = NC_rhum_fab[itime].const_array(); + auto const temp = NC_temp_fab[itime].const_array(); + auto const pres = NC_pres_fab[itime].const_array(); auto mxrat = mxrat_fab.array(); ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept @@ -915,14 +922,14 @@ init_state_from_metgrid (const bool use_moisture, { // vertical interpolation of vapor mixing ratio. #ifndef AMREX_USE_GPU - Print() << "[init_state_from_metgrid] vertical interpolation of vapor mixing ratio, it = " << it << std::endl; + Print() << "[init_state_from_metgrid] vertical interpolation of vapor mixing ratio, itime = " << itime << std::endl; #endif - Box bx2d = NC_temp_fab[it].box() & tbxc; + Box bx2d = NC_temp_fab[itime].box() & tbxc; bx2d.setRange(2,0); auto const orig_data = mxrat_fab.const_array(); - auto const orig_z = NC_ght_fab[it].const_array(); + auto const orig_z = NC_ght_fab[itime].const_array(); auto new_data = state_fab.array(); - auto bc_data = fabs_for_bcs[it][MetGridBdyVars::QV].array(); + auto bc_data = fabs_for_bcs[itime][MetGridBdyVars::QV].array(); auto const new_z = z_phys_nd_fab.const_array(); int kmax = ubound(tbxc).z; @@ -930,30 +937,29 @@ init_state_from_metgrid (const bool use_moisture, int state_indx = RhoQ1_comp; ParallelFor(bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - if (metgrid_basic_linear) { - for (int k = 0; k<=kmax; k++) { + if (metgrid_debug_dry) { // Debugging option to run dry. + for (int k(0); k<=kmax; k++) { + if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = 0.0; + if (itime == 0) new_data(i,j,k,state_indx) = 0.0; + } + } else if (metgrid_basic_linear) { // Linear interpolation with no quality control. + for (int k(0); k<=kmax; k++) { Real Interp_Val = interpolate_column_metgrid_linear(i,j,k,'M',0,orig_z,orig_data,new_z); if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = Interp_Val; - if (it==0) new_data(i,j,k,state_indx) = Interp_Val; + if (itime == 0) new_data(i,j,k,state_indx) = Interp_Val; } - } else { + } else { // Vertical interpolation and quality control similar to that from WRF. interpolate_column_metgrid(metgrid_use_below_sfc, metgrid_use_sfc, metgrid_exp_interp, metgrid_retain_sfc, metgrid_proximity, metgrid_order, - metgrid_force_sfc_k, i, j, state_indx, it, 'Q', 'M', + metgrid_force_sfc_k, i, j, state_indx, itime, 'Q', 'M', orig_z, orig_data, new_z, new_data, true, bc_data, mask_c_arr); } - if (metgrid_debug_dry) { // Debugging option to run dry. 
- for (int k = 0; k<=kmax; k++) { - if (mask_c_arr(i,j,k)) bc_data(i,j,k,0) = 0.0; - if (it==0) new_data(i,j,k,state_indx) = 0.0; - } - } }); } } // use_moisture - } // it + } // itime } @@ -1006,14 +1012,6 @@ init_base_state_from_metgrid (const bool use_moisture, gvbx_ylo.makeSlab(1,gvbx_ylo.smallEnd(1)); gvbx_yhi.makeSlab(1,gvbx_yhi.bigEnd(1)); gvbx_zlo.makeSlab(2,gvbx_zlo.smallEnd(2)); gvbx_zhi.makeSlab(2,gvbx_zhi.bigEnd(2)); - // Device vectors for columnwise operations - Gpu::DeviceVector z_vec_d(kmax+2,0); Real* z_vec = z_vec_d.data(); - Gpu::DeviceVector Thetad_vec_d(kmax+1,0); Real* Thetad_vec = Thetad_vec_d.data(); - Gpu::DeviceVector Thetam_vec_d(kmax+1,0); Real* Thetam_vec = Thetam_vec_d.data(); - Gpu::DeviceVector Rhom_vec_d(kmax+1,0); Real* Rhom_vec = Rhom_vec_d.data(); - Gpu::DeviceVector Pm_vec_d(kmax+1,0); Real* Pm_vec = Pm_vec_d.data(); - Gpu::DeviceVector Q_vec_d(kmax+1,0); Real* Q_vec = Q_vec_d.data(); - // Device vectors for psfc flags Gpu::DeviceVectorflag_psfc_d(flag_psfc.size()); Gpu::copy(Gpu::hostToDevice, flag_psfc.begin(), flag_psfc.end(), flag_psfc_d.begin()); @@ -1022,8 +1020,8 @@ init_base_state_from_metgrid (const bool use_moisture, // Define the arena to be used for data allocation Arena* Arena_Used = The_Arena(); #ifdef AMREX_USE_GPU - // Make sure this lives on CPU and GPU - Arena_Used = The_Pinned_Arena(); + // Inside MFiter use async arena + Arena_Used = The_Async_Arena(); #endif // Expose for copy to GPU Real grav = CONST_GRAV; @@ -1032,6 +1030,7 @@ init_base_state_from_metgrid (const bool use_moisture, const Array4& r_hse_arr = r_hse_fab.array(); const Array4& p_hse_arr = p_hse_fab.array(); const Array4& pi_hse_arr = pi_hse_fab.array(); + auto psfc_flag = flag_psfc_vec[0]; // ******************************************************** // calculate density and pressure for initial conditions. @@ -1044,21 +1043,88 @@ init_base_state_from_metgrid (const bool use_moisture, ParallelFor(valid_bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - for (int k=0; k<=kmax; k++) { - z_vec[k] = new_z(i,j,k); - Thetad_vec[k] = new_data(i,j,k,RhoTheta_comp); - Q_vec[k] = (use_moisture) ? new_data(i,j,k,RhoQ_comp) : 0.0; + const int maxiter = 10; + const amrex::Real tol = 1.0e-10; + + // Low and Hi column variables + Real psurf; + Real z_lo, z_hi; + Real p_lo, p_hi; + Real qv_lo, qv_hi; + Real rd_lo, rd_hi; + Real thetad_lo, thetad_hi; + + // Calculate or use pressure at the surface. + if (metgrid_debug_psfc) { + psurf = std::pow(10, 5); + } else if (psfc_flag == 1) { + psurf = orig_psfc(i,j,0); + } else { + z_lo = new_z(i,j,0); + Real t_0 = 290.0; // WRF's model_config_rec%base_temp + Real a = 50.0; // WRF's model_config_rec%base_lapse + psurf = p_0*exp(-t_0/a+std::pow((std::pow(t_0/a, 2.)-2.0*grav*z_lo/(a*R_d)), 0.5)); } - z_vec[kmax+1] = new_z(i,j,kmax+1); - calc_rho_p(kmax, - metgrid_debug_psfc, flag_psfc_vec[0], orig_psfc(i,j,0), - grav, Thetad_vec, Thetam_vec, Q_vec, z_vec, - Rhom_vec, Pm_vec); + // Iterations for the first CC point that is 1/2 dz off the surface + { + z_lo = new_z(i,j,0); + qv_lo = (use_moisture) ? 
new_data(i,j,0,RhoQ_comp) : 0.0; + rd_lo = 0.0; // initial guess + thetad_lo = new_data(i,j,0,RhoTheta_comp); + Real half_dz = z_lo; + Real qvf = 1.0+(R_v/R_d)*qv_lo; + Real thetam = thetad_lo*qvf; + for (int it(0); ittol) HSEutils::Newton_Raphson_hse(tol, R_d/Cp_d, dz, + grav, C, thetad_hi, + qv_hi, qv_hi, p_hi, + rd_hi, F); + + // Copy solution to base state + p_hse_arr(i,j,k) = p_hi; + r_hse_arr(i,j,k) = rd_hi; + + // Copy hi to lo + z_lo = z_hi; + p_lo = p_hi; + qv_lo = qv_hi; + rd_lo = rd_hi; + thetad_lo = thetad_hi; } }); @@ -1066,13 +1132,13 @@ init_base_state_from_metgrid (const bool use_moisture, { // Multiply by Rho to get conserved vars Real Qv = 0.0; - new_data(i,j,k,Rho_comp) = r_hse_arr(i,j,k); + new_data(i,j,k,Rho_comp) = r_hse_arr(i,j,k); new_data(i,j,k,RhoTheta_comp) *= r_hse_arr(i,j,k); - if (use_moisture){ + if (use_moisture) { Qv = new_data(i,j,k,RhoQ_comp); new_data(i,j,k,RhoQ_comp) *= r_hse_arr(i,j,k); } - for (int n = 0; n < NSCALARS; n++) { + for (int n(0); n < NSCALARS; n++) { new_data(i,j,k,RhoScalar_comp+n) = 0.0; } @@ -1132,41 +1198,109 @@ init_base_state_from_metgrid (const bool use_moisture, } int ntimes = NC_psfc_fab.size(); - for (int it=0; it{}; + auto Theta_arr = fabs_for_bcs[itime][MetGridBdyVars::T].array(); + auto Q_arr = (use_moisture ) ? fabs_for_bcs[itime][MetGridBdyVars::QV].array() : Array4{}; auto p_hse_arr = p_hse_bcs_fab.array(); ParallelFor(valid_bx2d, [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept { - for (int k=0; k<=kmax; k++) { - z_vec[k] = new_z(i,j,k); - Thetad_vec[k] = Theta_arr(i,j,k); - Q_vec[k] = (use_moisture) ? Q_arr(i,j,k) : 0.0; + const int maxiter = 10; + const amrex::Real tol = 1.0e-10; + + // Low and Hi column variables + Real psurf; + Real z_lo, z_hi; + Real p_lo, p_hi; + Real qv_lo, qv_hi; + Real rd_lo, rd_hi; + Real thetad_lo, thetad_hi; + + // Calculate or use pressure at the surface. + if (metgrid_debug_psfc) { + psurf = std::pow(10, 5); + } else if (psfc_flag == 1) { + psurf = orig_psfc(i,j,0); + } else { + z_lo = new_z(i,j,0); + Real t_0 = 290.0; // WRF's model_config_rec%base_temp + Real a = 50.0; // WRF's model_config_rec%base_lapse + psurf = p_0*exp(-t_0/a+std::pow((std::pow(t_0/a, 2.)-2.0*grav*z_lo/(a*R_d)), 0.5)); } - z_vec[kmax+1] = new_z(i,j,kmax+1); - calc_rho_p(kmax, - metgrid_debug_psfc, flag_psfc_vec[it], orig_psfc(i,j,0), - grav, Thetad_vec, Thetam_vec, Q_vec, z_vec, - Rhom_vec, Pm_vec); + // Iterations for the first CC point that is 1/2 dz off the surface + { + z_lo = new_z(i,j,0); + qv_lo = (use_moisture) ? 
Q_arr(i,j,0) : 0.0; + rd_lo = 0.0; // initial guess + thetad_lo = Theta_arr(i,j,0); + Real half_dz = z_lo; + Real qvf = 1.0+(R_v/R_d)*qv_lo; + Real thetam = thetad_lo*qvf; + for (int it(0); ittol) HSEutils::Newton_Raphson_hse(tol, R_d/Cp_d, dz, + grav, C, thetad_hi, + qv_hi, qv_hi, p_hi, + rd_hi, F); + + // Copy solution to base state + p_hse_arr(i,j,k) = p_hi; + + // Copy hi to lo + z_lo = z_hi; + p_lo = p_hi; + qv_lo = qv_hi; + rd_lo = rd_hi; + thetad_lo = thetad_hi; + } - for (int k=0; k<=kmax; k++) { - p_hse_arr(i,j,k) = Pm_vec[k]; - } // k }); - } // it + } // itime } @@ -1193,7 +1327,7 @@ init_msfs_from_metgrid (const bool metgrid_debug_msf, { // int ntimes = NC_MSFU_fab.size(); int ntimes = 1; - for (int it = 0; it < ntimes; it++) { + for (int itime(0); itime < ntimes; itime++) { // // FArrayBox to FArrayBox copy does "copy on intersection" // This only works here because we have broadcast the FArrayBox of data from the netcdf file to all ranks @@ -1201,9 +1335,9 @@ init_msfs_from_metgrid (const bool metgrid_debug_msf, // This copies or sets mapfac_m if ((flag_msf == 1) and (!metgrid_debug_msf)) { - msfm_fab.template copy(NC_MSFM_fab[it]); - msfu_fab.template copy(NC_MSFU_fab[it]); - msfv_fab.template copy(NC_MSFV_fab[it]); + msfm_fab.template copy(NC_MSFM_fab[itime]); + msfu_fab.template copy(NC_MSFU_fab[itime]); + msfv_fab.template copy(NC_MSFV_fab[itime]); } else { #ifndef AMREX_USE_GPU Print() << " map factors are not present in met_em files. Setting to 1.0" << std::endl; @@ -1212,6 +1346,6 @@ init_msfs_from_metgrid (const bool metgrid_debug_msf, msfu_fab.template setVal(1.0); msfv_fab.template setVal(1.0); } - } // it + } // itime } #endif // ERF_USE_NETCDF diff --git a/Source/Initialization/ERF_init_from_wrfinput.cpp b/Source/Initialization/ERF_init_from_wrfinput.cpp index 7c3a1d062..3e9cef97e 100644 --- a/Source/Initialization/ERF_init_from_wrfinput.cpp +++ b/Source/Initialization/ERF_init_from_wrfinput.cpp @@ -277,7 +277,7 @@ ERF::init_from_wrfinput (int lev) IntVect ng = p_hse.nGrowVect(); const Real l_rdOcp = solverChoice.rdOcp; - if (init_type == "real") { + if (init_type == InitType::Real) { for ( MFIter mfi(lev_new[Vars::cons], TilingIfNotGPU()); mfi.isValid(); ++mfi ) { FArrayBox& cons_fab = lev_new[Vars::cons][mfi]; @@ -297,7 +297,7 @@ ERF::init_from_wrfinput (int lev) pi_hse.FillBoundary(geom[lev].periodicity()); } - if (init_type == "real" && (lev == 0)) { + if (init_type == InitType::Real && (lev == 0)) { if (nc_bdy_file.empty()) { amrex::Error("NetCDF boundary file name must be provided via input"); } diff --git a/Source/Initialization/ERF_init_windfarm.cpp b/Source/Initialization/ERF_init_windfarm.cpp index bf7d3dcc2..bc41acf57 100644 --- a/Source/Initialization/ERF_init_windfarm.cpp +++ b/Source/Initialization/ERF_init_windfarm.cpp @@ -43,8 +43,10 @@ ERF::init_windfarm (int lev) } if(solverChoice.windfarm_type == WindFarmType::GeneralAD) { - //windfarm->read_windfarm_blade_table(solverChoice.windfarm_blade_table); - //windfarm->read_airfoil_tables + windfarm->read_windfarm_blade_table(solverChoice.windfarm_blade_table); + windfarm->read_windfarm_airfoil_tables(solverChoice.windfarm_airfoil_tables, + solverChoice.windfarm_blade_table); + windfarm->read_windfarm_spec_table_extra(solverChoice.windfarm_spec_table_extra); } } diff --git a/Source/LandSurfaceModel/ERF_LandSurface.H b/Source/LandSurfaceModel/ERF_LandSurface.H index 9db80be66..56ad3b2ea 100644 --- a/Source/LandSurfaceModel/ERF_LandSurface.H +++ b/Source/LandSurfaceModel/ERF_LandSurface.H @@ -7,6 +7,10 @@ 
#include #include +#if ERF_USE_NOAH +#include +#endif + class LandSurface { public: diff --git a/Source/LandSurfaceModel/NOAH/ERF_NOAH.H b/Source/LandSurfaceModel/NOAH/ERF_NOAH.H new file mode 100644 index 000000000..962c2dd2b --- /dev/null +++ b/Source/LandSurfaceModel/NOAH/ERF_NOAH.H @@ -0,0 +1,41 @@ +#ifndef NOAH_H +#define NOAH_H + +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +// External include from the noahmp library +#include + +class NOAH : public NullSurf { +public: + // Constructor + NOAH () {} + + // Destructor + virtual ~NOAH () = default; + + // Initialize data structures + void + Init (const amrex::MultiFab& cons_in, + const amrex::Geometry& geom, + const amrex::Real& dt) override; + + +private: + + // C++ variable for NoahmpIO struct + NoahmpIO_struct noahmpio; + +}; +#endif diff --git a/Source/LandSurfaceModel/NOAH/ERF_NOAH.cpp b/Source/LandSurfaceModel/NOAH/ERF_NOAH.cpp new file mode 100644 index 000000000..3e4acf206 --- /dev/null +++ b/Source/LandSurfaceModel/NOAH/ERF_NOAH.cpp @@ -0,0 +1,30 @@ + +#include + +#include +#include + +using namespace amrex; + +/* Initialize lsm data structures */ +void +NOAH::Init (const MultiFab& cons_in, + const Geometry& geom, + const Real& dt) +{ + // Initialize Noahmp IO + amrex::Print() << "Initializing Noahmp IO" << std::endl; + + /* + * noahmpio.xstart = 1; + * noahmpio.xend = 4; + * noahmpio.ystart = 1; + * noahmpio.yend = 2; + * + */ + + NoahmpIOVarInitDefault(&noahmpio); + NoahmpInitMain(&noahmpio); + + amrex::Print() << "Noahmp IO Initialized" << std::endl; +}; diff --git a/Source/LandSurfaceModel/NOAH/Make.package b/Source/LandSurfaceModel/NOAH/Make.package new file mode 100644 index 000000000..4f1e6f483 --- /dev/null +++ b/Source/LandSurfaceModel/NOAH/Make.package @@ -0,0 +1,2 @@ +CEXE_sources += ERF_NOAH.cpp +CEXE_headers += ERF_NOAH.H diff --git a/Source/Microphysics/ERF_EulerianMicrophysics.H b/Source/Microphysics/ERF_EulerianMicrophysics.H index 9892a30fe..cb91854b4 100644 --- a/Source/Microphysics/ERF_EulerianMicrophysics.H +++ b/Source/Microphysics/ERF_EulerianMicrophysics.H @@ -31,14 +31,11 @@ public: a_model_type == MoistureType::SAM_NoIce || a_model_type == MoistureType::SAM_NoPrecip_NoIce) { SetModel(); - amrex::Print() << "SAM moisture model!\n"; } else if (a_model_type == MoistureType::Kessler || a_model_type == MoistureType::Kessler_NoRain) { SetModel(); - amrex::Print() << "Kessler moisture model!\n"; } else if (a_model_type == MoistureType::None) { SetModel(); - amrex::Print() << "No moisture model!\n"; } else { amrex::Abort("EulerianMicrophysics: Dont know this moisture_type!") ; } diff --git a/Source/PBL/ERF_ComputeDiffusivityYSU.cpp b/Source/PBL/ERF_ComputeDiffusivityYSU.cpp index 3a9c92839..92503e5f7 100644 --- a/Source/PBL/ERF_ComputeDiffusivityYSU.cpp +++ b/Source/PBL/ERF_ComputeDiffusivityYSU.cpp @@ -121,7 +121,8 @@ ComputeDiffusivityYSU (const MultiFab& xvel, while (!above_critical and bx.contains(i,j,kpbl+1)) { kpbl += 1; const Real zval = use_terrain ? 
Compute_Zrel_AtCellCenter(i,j,kpbl,z_nd_arr) : gdata.ProbLo(2) + (kpbl + 0.5)*gdata.CellSize(2); - const Real ws2_level = 0.25*((uvel(i,j,kpbl)+uvel(i+1,j,kpbl))*(uvel(i,j,kpbl)+uvel(i+1,j,kpbl)) + (vvel(i,j,kpbl)+vvel(i,j+1,kpbl))*(uvel(i,j,kpbl)+uvel(i,j+1,kpbl))); + const Real ws2_level = 0.25*( (uvel(i,j,kpbl)+uvel(i+1,j ,kpbl))*(uvel(i,j,kpbl)+uvel(i+1,j ,kpbl)) + + (vvel(i,j,kpbl)+vvel(i ,j+1,kpbl))*(vvel(i,j,kpbl)+vvel(i ,j+1,kpbl)) ); const Real theta = cell_data(i,j,kpbl,RhoTheta_comp) / cell_data(i,j,kpbl,Rho_comp); Rib_dn = Rib_up; Rib_up = (theta-base_theta)/base_theta * CONST_GRAV * zval / ws2_level; diff --git a/Source/SourceTerms/ERF_make_buoyancy.cpp b/Source/SourceTerms/ERF_make_buoyancy.cpp index 4cb518a29..d9ec30a53 100644 --- a/Source/SourceTerms/ERF_make_buoyancy.cpp +++ b/Source/SourceTerms/ERF_make_buoyancy.cpp @@ -92,7 +92,6 @@ void make_buoyancy (Vector& S_data, // ****************************************************************************************** if (solverChoice.moisture_type == MoistureType::None) { - if (solverChoice.buoyancy_type == 1) { #ifdef _OPENMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) @@ -119,11 +118,14 @@ void make_buoyancy (Vector& S_data, } // mfi } -#if 0 - else + else // (buoyancy_type != 1) { - // We use the base state rather than planar average because we don't want to average over - // the limited region of the fine level + // We now use the base state rather than planar average because + // 1) we don't want to average over the limited region of the fine level if doing multilevel. + // 2) it's cheaper to use the base state than to compute the horizontal averages + // 3) when running in a smallish domain, the horizontal average may evolve over time, + // which is not necessarily the intended behavior + // #ifdef _OPENMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) #endif @@ -147,7 +149,7 @@ void make_buoyancy (Vector& S_data, Real rt0_hi = getRhoThetagivenP(p0_arr(i,j,k)); Real t0_hi = getTgivenPandTh(p0_arr(i,j,k), rt0_hi/r0_arr(i,j,k), rd_over_cp); Real t_hi = getTgivenRandRTh(cell_data(i,j,k ,Rho_comp), cell_data(i,j,k ,RhoTheta_comp)); - Real qplus = (t_hi-t0_hi)/t0_hi; + Real qplus = (t_hi-t0_hi)/t0_hi; Real rt0_lo = getRhoThetagivenP(p0_arr(i,j,k-1)); Real t0_lo = getTgivenPandTh(p0_arr(i,j,k-1), rt0_lo/r0_arr(i,j,k-1), rd_over_cp); @@ -158,61 +160,6 @@ void make_buoyancy (Vector& S_data, buoyancy_fab(i, j, k) = -r0_q_avg * grav_gpu[2]; }); } // mfi - } -#else - else if (solverChoice.buoyancy_type == 2 || solverChoice.buoyancy_type == 3) - { - PlaneAverage state_ave(&(S_data[IntVars::cons]), geom, solverChoice.ave_plane); - PlaneAverage prim_ave(&S_prim, geom, solverChoice.ave_plane); - - int ncell = state_ave.ncell_line(); - - state_ave.compute_averages(ZDir(), state_ave.field()); - prim_ave.compute_averages(ZDir(), prim_ave.field()); - - Gpu::HostVector rho_h(ncell), theta_h(ncell); - state_ave.line_average(Rho_comp, rho_h); - prim_ave.line_average(PrimTheta_comp, theta_h); - - Gpu::DeviceVector rho_d(ncell); - Gpu::DeviceVector theta_d(ncell); - - Gpu::copyAsync(Gpu::hostToDevice, rho_h.begin(), rho_h.end(), rho_d.begin()); - Gpu::copyAsync(Gpu::hostToDevice, theta_h.begin(), theta_h.end(), theta_d.begin()); - - Real* rho_d_ptr = rho_d.data(); - Real* theta_d_ptr = theta_d.data(); - -#ifdef _OPENMP -#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) -#endif - for ( MFIter mfi(buoyancy,TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - Box tbz = mfi.tilebox(); - - // We don't compute a source term 
for z-momentum on the bottom or top boundary - if (tbz.smallEnd(2) == klo) tbz.growLo(2,-1); - if (tbz.bigEnd(2) == khi) tbz.growHi(2,-1); - - const Array4 & cell_data = S_data[IntVars::cons].array(mfi); - const Array4< Real> & buoyancy_fab = buoyancy.array(mfi); - - ParallelFor(tbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) - { - Real tempp1d = getTgivenRandRTh(rho_d_ptr[k ], rho_d_ptr[k ]*theta_d_ptr[k ]); - Real tempp3d = getTgivenRandRTh(cell_data(i,j,k ,Rho_comp), cell_data(i,j,k ,RhoTheta_comp)); - Real qplus = (tempp3d-tempp1d)/tempp1d; - - Real tempm1d = getTgivenRandRTh(rho_d_ptr[k-1], rho_d_ptr[k-1]*theta_d_ptr[k-1]); - Real tempm3d = getTgivenRandRTh(cell_data(i,j,k-1,Rho_comp), cell_data(i,j,k-1,RhoTheta_comp)); - Real qminus = (tempm3d-tempm1d)/tempm1d; - - Real r0_q_avg = Real(0.5) * (rho_d_ptr[k]*qplus + rho_d_ptr[k-1]*qminus); - - buoyancy_fab(i, j, k) = -r0_q_avg * grav_gpu[2]; - }); - } // mfi -#endif } // buoyancy_type } // moisture type else @@ -224,7 +171,7 @@ void make_buoyancy (Vector& S_data, if ( (solverChoice.moisture_type == MoistureType::Kessler_NoRain) || (solverChoice.moisture_type == MoistureType::SAM) || (solverChoice.moisture_type == MoistureType::SAM_NoPrecip_NoIce) ) - { + { AMREX_ALWAYS_ASSERT(solverChoice.buoyancy_type == 1); } @@ -269,7 +216,7 @@ void make_buoyancy (Vector& S_data, // Compute horizontal averages of all components of each field state_ave.compute_averages(ZDir(), state_ave.field()); - prim_ave.compute_averages(ZDir(), prim_ave.field()); + prim_ave.compute_averages(ZDir(), prim_ave.field()); int ncell = state_ave.ncell_line(); @@ -290,7 +237,7 @@ void make_buoyancy (Vector& S_data, Gpu::DeviceVector qv_d(ncell,0.0), qc_d(ncell,0.0), qp_d(ncell,0.0); if (n_qstate >=1) { prim_ave.line_average(PrimQ1_comp, qv_h); - Gpu::copyAsync(Gpu::hostToDevice, qv_h.begin(), qv_h.end(), qv_d.begin()); + Gpu::copyAsync(Gpu::hostToDevice, qv_h.begin(), qv_h.end(), qv_d.begin()); } if (n_qstate >=2) { prim_ave.line_average(PrimQ2_comp, qc_h); @@ -305,6 +252,7 @@ void make_buoyancy (Vector& S_data, Real* qp_d_ptr = qp_d.data(); if (solverChoice.buoyancy_type == 2 || solverChoice.buoyancy_type == 4 ) { + #ifdef _OPENMP #pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) #endif diff --git a/Source/TimeIntegration/ERF_Advance.cpp b/Source/TimeIntegration/ERF_Advance.cpp index c5c4ca369..db7cbf1a0 100644 --- a/Source/TimeIntegration/ERF_Advance.cpp +++ b/Source/TimeIntegration/ERF_Advance.cpp @@ -92,14 +92,20 @@ ERF::Advance (int lev, Real time, Real dt_lev, int iteration, int /*ncycle*/) V_new.setVal(1.e34,V_new.nGrowVect()); W_new.setVal(1.e34,W_new.nGrowVect()); + // + // NOTE: the momenta here are not fillpatched (they are only used as scratch space) + // FillPatch(lev, time, {&S_old, &U_old, &V_old, &W_old}, {&S_old, &rU_old[lev], &rV_old[lev], &rW_old[lev]}); - - if (solverChoice.moisture_type != MoistureType::None) { - // TODO: This is only qv - if (qmoist[lev].size() > 0) FillPatchMoistVars(lev, *(qmoist[lev][0])); - } - + // + // So we must convert the fillpatched to momenta, including the ghost values + // + VelocityToMomentum(U_old, rU_old[lev].nGrowVect(), + V_old, rV_old[lev].nGrowVect(), + W_old, rW_old[lev].nGrowVect(), + S_old, rU_old[lev], rV_old[lev], rW_old[lev], + Geom(lev).Domain(), + domain_bcs_type); #if defined(ERF_USE_WINDFARM) if (solverChoice.windfarm_type != WindFarmType::None) { diff --git a/Source/TimeIntegration/ERF_MRI.H b/Source/TimeIntegration/ERF_MRI.H index 58302b1a9..0a38cc63c 100644 --- 
a/Source/TimeIntegration/ERF_MRI.H +++ b/Source/TimeIntegration/ERF_MRI.H @@ -273,6 +273,10 @@ public: nsubsteps = 1; dtau = timestep; } else { nsubsteps = substep_ratio; dtau = sub_timestep; + + // STRT HACK -- this hack can be used to approximate the no-substepping algorithm + // nsubsteps = 1; dtau = timestep; + // END HACK } time_stage = time + timestep; } diff --git a/Source/TimeIntegration/ERF_TI_fast_headers.H b/Source/TimeIntegration/ERF_TI_fast_headers.H index 4504db371..5c470d9da 100644 --- a/Source/TimeIntegration/ERF_TI_fast_headers.H +++ b/Source/TimeIntegration/ERF_TI_fast_headers.H @@ -37,7 +37,8 @@ void erf_fast_rhs_N (int step, int nrk, int level, int finest_level, std::unique_ptr& mapfac_v, amrex::YAFluxRegister* fr_as_crse, amrex::YAFluxRegister* fr_as_fine, - bool l_use_moisture, bool l_reflux); + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); /** * Function for computing the fast RHS with fixed terrain @@ -64,7 +65,8 @@ void erf_fast_rhs_T (int step, int nrk, int level, int finest_level, std::unique_ptr& mapfac_v, amrex::YAFluxRegister* fr_as_crse, amrex::YAFluxRegister* fr_as_fine, - bool l_use_moisture, bool l_reflux); + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); /** * Function for computing the fast RHS with moving terrain @@ -98,7 +100,8 @@ void erf_fast_rhs_MT (int step, int nrk, int level, int finest_level, std::unique_ptr& mapfac_v, amrex::YAFluxRegister* fr_as_crse, amrex::YAFluxRegister* fr_as_fine, - bool l_use_moisture, bool l_reflux); + bool l_use_moisture, bool l_reflux, + bool l_implicit_substepping); /** * Function for computing the coefficients for the tridiagonal solver used in the fast diff --git a/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H b/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H index f61dba333..1213b8cf6 100644 --- a/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H +++ b/Source/TimeIntegration/ERF_TI_fast_rhs_fun.H @@ -20,7 +20,13 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, // Per p2902 of Klemp-Skamarock-Dudhia-2007 // beta_s = -1.0 : fully explicit // beta_s = 1.0 : fully implicit - Real beta_s = 0.1; + Real beta_s; + if (solverChoice.substepping_type[level] == SubsteppingType::Implicit) + { + beta_s = 0.1; + } else { // Fully explicit + beta_s = -1.0; + } // ************************************************************************* // Set up flux registers if using two_way coupling @@ -99,7 +105,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } else { // If this is not the first substep we pass in S_data as the previous step's solution erf_fast_rhs_MT(fast_step, nrk, level, finest_level, @@ -111,7 +117,7 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, detJ_cc[level], detJ_cc_new[level], detJ_cc_src[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } } else if (solverChoice.use_terrain && solverChoice.terrain_type == TerrainType::Static) { if (fast_step == 0) { @@ -122,20 +128,22 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, detJ_cc[level], r0, pi0, dtau, beta_s, phys_bc_type); // If this is the first 
substep we pass in S_old as the previous step's solution + // and S_data is the new-time solution to be defined here erf_fast_rhs_T(fast_step, nrk, level, finest_level, S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } else { - // If this is not the first substep we pass in S_data as the previous step's solution + // If this is not the first substep we pass in S_data as both the previous step's solution + // and as the new-time solution to be defined here erf_fast_rhs_T(fast_step, nrk, level, finest_level, S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, S_data, S_scratch, fine_geom, solverChoice.gravity, Omega, z_phys_nd[level], detJ_cc[level], dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } } else { if (fast_step == 0) { @@ -146,20 +154,22 @@ auto fast_rhs_fun = [&](int fast_step, int /*n_sub*/, int nrk, detJ_cc[level], r0, pi0, dtau, beta_s, phys_bc_type); // If this is the first substep we pass in S_old as the previous step's solution + // and S_data is the new-time solution to be defined here erf_fast_rhs_N(fast_step, nrk, level, finest_level, S_slow_rhs, S_old, S_stage, S_prim, pi_stage, fast_coeffs, S_data, S_scratch, fine_geom, solverChoice.gravity, dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } else { - // If this is not the first substep we pass in S_data as the previous step's solution + // If this is not the first substep we pass in S_data as both the previous step's solution + // and as the new-time solution to be defined here erf_fast_rhs_N(fast_step, nrk, level, finest_level, S_slow_rhs, S_data, S_stage, S_prim, pi_stage, fast_coeffs, S_data, S_scratch, fine_geom, solverChoice.gravity, dtau, beta_s, inv_fac, mapfac_m[level], mapfac_u[level], mapfac_v[level], - fr_as_crse, fr_as_fine, l_use_moisture, l_reflux); + fr_as_crse, fr_as_fine, l_use_moisture, l_reflux, l_implicit_substepping); } } diff --git a/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H b/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H index c9f963c68..994379325 100644 --- a/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H +++ b/Source/TimeIntegration/ERF_TI_slow_rhs_fun.H @@ -249,8 +249,7 @@ // Populate RHS for relaxation zones if using real bcs if (use_real_bcs && (level == 0)) { if (real_width>0) { - realbdy_compute_interior_ghost_rhs(init_type, - bdy_time_interval, start_bdy_time, new_stage_time, slow_dt, + realbdy_compute_interior_ghost_rhs(bdy_time_interval, start_bdy_time, new_stage_time, slow_dt, real_width, real_set_width, fine_geom, S_rhs, S_old, S_data, bdy_data_xlo, bdy_data_xhi, @@ -459,8 +458,7 @@ // Populate RHS for relaxation zones if using real bcs if (use_real_bcs && (level == 0)) { if (real_width>0) { - realbdy_compute_interior_ghost_rhs(init_type, - bdy_time_interval, start_bdy_time, new_stage_time, slow_dt, + realbdy_compute_interior_ghost_rhs(bdy_time_interval, start_bdy_time, new_stage_time, slow_dt, real_width, real_set_width, fine_geom, S_rhs, S_old, S_data, bdy_data_xlo, 
bdy_data_xhi, diff --git a/Source/TimeIntegration/ERF_TI_utils.H b/Source/TimeIntegration/ERF_TI_utils.H index 21c330bc9..7870508a4 100644 --- a/Source/TimeIntegration/ERF_TI_utils.H +++ b/Source/TimeIntegration/ERF_TI_utils.H @@ -15,19 +15,32 @@ const Box& gbx = mfi.growntilebox(ng); const Array4<const Real>& cons_arr = cons_state.array(mfi); const Array4< Real>& prim_arr = S_prim.array(mfi); - const Array4< Real>& pi_stage_arr = pi_stage.array(mfi); - const Real rdOcp = solverChoice.rdOcp; + // + // We may need more than one ghost cell of prim in order to compute higher order advective terms + // amrex::ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { Real rho = cons_arr(i,j,k,Rho_comp); Real rho_theta = cons_arr(i,j,k,RhoTheta_comp); prim_arr(i,j,k,PrimTheta_comp) = rho_theta / rho; - pi_stage_arr(i,j,k) = getExnergivenRTh(rho_theta, rdOcp); for (int n = 1; n < ncomp_prim; ++n) { prim_arr(i,j,k,PrimTheta_comp + n) = cons_arr(i,j,k,RhoTheta_comp + n) / rho; } }); + + // + // We only use one ghost cell of pi_stage so we only fill one here + // + const Box& gbx1 = mfi.growntilebox(1); + + const Array4< Real>& pi_stage_arr = pi_stage.array(mfi); + const Real rdOcp = solverChoice.rdOcp; + + amrex::ParallelFor(gbx1, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + pi_stage_arr(i,j,k) = getExnergivenRTh(cons_arr(i,j,k,RhoTheta_comp), rdOcp); + }); } // mfi }; diff --git a/Source/TimeIntegration/ERF_advance_dycore.cpp b/Source/TimeIntegration/ERF_advance_dycore.cpp index 2872007e8..8c78fcfb3 100644 --- a/Source/TimeIntegration/ERF_advance_dycore.cpp +++ b/Source/TimeIntegration/ERF_advance_dycore.cpp @@ -90,6 +90,7 @@ void ERF::advance_dycore(int level, bool l_use_kturb = ( (tc.les_type != LESType::None) || (tc.pbl_type != PBLType::None) ); bool l_use_moisture = ( solverChoice.moisture_type != MoistureType::None ); + bool l_implicit_substepping = ( solverChoice.substepping_type[level] == SubsteppingType::Implicit ); const bool use_most = (m_most != nullptr); const bool exp_most = (solverChoice.use_explicit_most); @@ -252,9 +253,10 @@ void ERF::advance_dycore(int level, // This is an optimization since we won't need more than one ghost // cell of momentum in the integrator if not using NumDiff // - IntVect ngu = (solverChoice.use_NumDiff) ? IntVect(1,1,1) : xvel_old.nGrowVect(); - IntVect ngv = (solverChoice.use_NumDiff) ? IntVect(1,1,1) : yvel_old.nGrowVect(); - IntVect ngw = (solverChoice.use_NumDiff) ? IntVect(1,1,0) : zvel_old.nGrowVect(); + IntVect ngu = (!solverChoice.use_NumDiff) ? IntVect(1,1,1) : xvel_old.nGrowVect(); + IntVect ngv = (!solverChoice.use_NumDiff) ? IntVect(1,1,1) : yvel_old.nGrowVect(); + IntVect ngw = (!solverChoice.use_NumDiff) ?
IntVect(1,1,0) : zvel_old.nGrowVect(); + VelocityToMomentum(xvel_old, ngu, yvel_old, ngv, zvel_old, ngw, density, state_old[IntVars::xmom], state_old[IntVars::ymom], diff --git a/Source/TimeIntegration/ERF_fast_rhs_MT.cpp b/Source/TimeIntegration/ERF_fast_rhs_MT.cpp index 1ff3b524b..1843a7d8f 100644 --- a/Source/TimeIntegration/ERF_fast_rhs_MT.cpp +++ b/Source/TimeIntegration/ERF_fast_rhs_MT.cpp @@ -7,38 +7,40 @@ using namespace amrex; * Function for computing the fast RHS with moving terrain * * @param[in] step which fast time step within each Runge-Kutta step - * @param[in] nrk which Runge-Kutta step - * @param[in] level level of resolution - * @param[in] finest_level finest level of resolution - * @param[in] S_slow_rhs slow RHS computed in erf_slow_rhs_pre - * @param[in] S_prev previous solution - * @param[in] S_stg_data solution at previous RK stage - * @param[in] S_stg_prim primitive variables at previous RK stage - * @param[in] pi_stage Exner function at previous RK stage - * @param[in] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator - * @param[out] S_data current solution - * @param[in] S_scratch scratch space - * @param[in] geom container for geometric information - * @param[in] gravity Magnitude of gravity - * @param[in] use_lagged_delta_rt define lagged_delta_rt for our next step - * @param[in] Omega component of the momentum normal to the z-coordinate surface - * @param[in] z_t_rk rate of change of grid height -- only relevant for moving terrain - * @param[in] z_t_pert rate of change of grid height -- interpolated between RK stages - * @param[in] z_phys_nd_old height coordinate at nodes at old time - * @param[in] z_phys_nd_new height coordinate at nodes at new time - * @param[in] z_phys_nd_stg height coordinate at nodes at previous stage - * @param[in] detJ_cc_old Jacobian of the metric transformation at old time - * @param[in] detJ_cc_new Jacobian of the metric transformation at new time - * @param[in] detJ_cc_stg Jacobian of the metric transformation at previous stage - * @param[in] dtau fast time step - * @param[in] beta_s Coefficient which determines how implicit vs explicit the solve is - * @param[in] facinv inverse factor for time-averaging the momenta - * @param[in] mapfac_m map factor at cell centers - * @param[in] mapfac_u map factor at x-faces - * @param[in] mapfac_v map factor at y-faces + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev previous solution + * @param[in ] S_stg_data solution at previous RK stage + * @param[in ] S_stg_prim primitive variables at previous RK stage + * @param[in ] pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity Magnitude of gravity + * @param[in ] use_lagged_delta_rt define lagged_delta_rt for our next step + * @param[in ] Omega component of the momentum normal to the z-coordinate surface + * @param[in ] z_t_rk rate of change of grid height -- only relevant for moving terrain + * @param[in ] z_t_pert rate of change of grid height -- interpolated between RK stages + * @param[in ] z_phys_nd_old height coordinate at nodes at old time + * @param[in ] z_phys_nd_new height coordinate at 
nodes at new time + * @param[in ] z_phys_nd_stg height coordinate at nodes at previous stage + * @param[in ] detJ_cc_old Jacobian of the metric transformation at old time + * @param[in ] detJ_cc_new Jacobian of the metric transformation at new time + * @param[in ] detJ_cc_stg Jacobian of the metric transformation at previous stage + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface - * @param[in] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_implicit_substepping */ void erf_fast_rhs_MT (int step, int nrk, @@ -71,7 +73,8 @@ void erf_fast_rhs_MT (int step, int nrk, YAFluxRegister* fr_as_crse, YAFluxRegister* fr_as_fine, bool l_use_moisture, - bool l_reflux) + bool l_reflux, + bool /*l_implicit_substepping*/) { BL_PROFILE_REGION("erf_fast_rhs_MT()"); diff --git a/Source/TimeIntegration/ERF_fast_rhs_N.cpp b/Source/TimeIntegration/ERF_fast_rhs_N.cpp index fec38e38e..5bafd34bf 100644 --- a/Source/TimeIntegration/ERF_fast_rhs_N.cpp +++ b/Source/TimeIntegration/ERF_fast_rhs_N.cpp @@ -6,39 +6,41 @@ using namespace amrex; /** * Function for computing the fast RHS with no terrain * - * @param[in] step which fast time step within each Runge-Kutta step - * @param[in] nrk which Runge-Kutta step - * @param[in] level level of resolution - * @param[in] finest_level finest level of resolution - * @param[in] S_slow_rhs slow RHS computed in erf_slow_rhs_pre - * @param[in] S_prev previous solution - * @param[in] S_stage_data solution at previous RK stage - * @param[in] S_stage_prim primitive variables at previous RK stage - * @param[in] pi_stage Exner function at previous RK stage - * @param[in] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator - * @param[out] S_data current solution - * @param[in] S_scratch scratch space - * @param[in] geom container for geometric information - * @param[in] gravity magnitude of gravity - * @param[in] dtau fast time step - * @param[in] beta_s Coefficient which determines how implicit vs explicit the solve is - * @param[in] facinv inverse factor for time-averaging the momenta - * @param[in] mapfac_m map factor at cell centers - * @param[in] mapfac_u map factor at x-faces - * @param[in] mapfac_v map factor at y-faces - * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface - * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface - * @param[in] l_reflux should we add fluxes to the FluxRegisters? 
+ * @param[in ] step which fast time step within each Runge-Kutta step + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev if step == 0, this is S_old, else the previous fast solution + * @param[in ] S_stage_data solution at previous RK stage + * @param[in ] S_stage_prim primitive variables at previous RK stage + * @param[in ] pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity magnitude of gravity + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces + * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface + * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_implicit_substepping */ void erf_fast_rhs_N (int step, int nrk, int level, int finest_level, Vector& S_slow_rhs, // the slow RHS already computed const Vector& S_prev, // if step == 0, this is S_old, else the previous solution - Vector& S_stage_data, // S_bar = S^n, S^* or S^** - const MultiFab& S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] - const MultiFab& pi_stage, // Exner function evaluated at last stage - const MultiFab& fast_coeffs, // Coeffs for tridiagonal solve + Vector& S_stage_data, // S_stage = S^n, S^* or S^** + const MultiFab & S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] + const MultiFab & pi_stage, // Exner function evaluated at last stage + const MultiFab &fast_coeffs, // Coeffs for tridiagonal solve Vector& S_data, // S_sum = most recent full solution Vector& S_scratch, // S_sum_old at most recent fast timestep for (rho theta) const Geometry geom, @@ -51,8 +53,13 @@ void erf_fast_rhs_N (int step, int nrk, YAFluxRegister* fr_as_crse, YAFluxRegister* fr_as_fine, bool l_use_moisture, - bool l_reflux) + bool l_reflux, + bool l_implicit_substepping) { + // + // NOTE: for step > 0, S_data and S_prev point to the same MultiFab data!! 
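The NOTE just above is the key constraint on everything that follows in erf_fast_rhs_N: for step > 0 the "previous" and "current" state containers alias the same storage, so each old-minus-stage increment has to be captured into scratch before the current solution is overwritten. A minimal sketch of that read-before-overwrite ordering, with plain std::vector standing in for amrex::MultiFab and all names illustrative rather than taken from the source:

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch only: 'state' plays the role of both S_prev and S_data, which
// alias the same data for step > 0 in erf_fast_rhs_N.
void fast_substep_update(std::vector<double>& state,        // aliased prev/cur solution
                         const std::vector<double>& stage,  // last RK stage
                         std::vector<double>& delta,        // scratch, like Delta_rho_w
                         double dtau,
                         const std::vector<double>& rhs)    // slow RHS
{
    const std::size_t n = state.size();
    assert(stage.size() == n && delta.size() == n && rhs.size() == n);

    // (1) Capture prev - stage first, while 'state' still holds the
    //     previous substep's solution.
    for (std::size_t k = 0; k < n; ++k) {
        delta[k] = state[k] - stage[k];
    }

    // (2) Only now overwrite 'state'; reading it after this point would
    //     return new-time values, which is exactly the hazard the NOTE warns about.
    for (std::size_t k = 0; k < n; ++k) {
        state[k] = stage[k] + delta[k] + dtau * rhs[k];
    }
}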
+ // + BL_PROFILE_REGION("erf_fast_rhs_N()"); Real beta_1 = 0.5 * (1.0 - beta_s); // multiplies explicit terms @@ -71,9 +78,8 @@ void erf_fast_rhs_N (int step, int nrk, const auto& ba = S_stage_data[IntVars::cons].boxArray(); const auto& dm = S_stage_data[IntVars::cons].DistributionMap(); - MultiFab Delta_rho_w( convert(ba,IntVect(0,0,1)), dm, 1, IntVect(1,1,0)); - MultiFab Delta_rho ( ba , dm, 1, 1); MultiFab Delta_rho_theta( ba , dm, 1, 1); + MultiFab Delta_rho_w (convert(ba,IntVect(0,0,1)), dm, 1, IntVect(1,1,0)); MultiFab coeff_A_mf(fast_coeffs, make_alias, 0, 1); MultiFab inv_coeff_B_mf(fast_coeffs, make_alias, 1, 1); @@ -96,6 +102,9 @@ void erf_fast_rhs_N (int step, int nrk, MultiFab temp_cur_xmom(S_stage_data[IntVars::xmom].boxArray(),S_stage_data[IntVars::xmom].DistributionMap(),1,0); MultiFab temp_cur_ymom(S_stage_data[IntVars::ymom].boxArray(),S_stage_data[IntVars::ymom].DistributionMap(),1,0); + // We assume that in the first step (nrk == 0) we are only doing one substep. + AMREX_ALWAYS_ASSERT(nrk > 0 || step == 0); + // ************************************************************************* // First set up some arrays we'll need // ************************************************************************* @@ -105,61 +114,40 @@ void erf_fast_rhs_N (int step, int nrk, #endif for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) { - const Array4 & cur_cons = S_data[IntVars::cons].array(mfi); const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); - const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); - const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); - - const Array4& old_drho = Delta_rho.array(mfi); - const Array4& old_drho_w = Delta_rho_w.array(mfi); - const Array4& old_drho_theta = Delta_rho_theta.array(mfi); + const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); - const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); + const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); const Array4& stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); - Box gbx = mfi.tilebox(); gbx.grow(1); - - if (step == 0) { - ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept - { - cur_cons(i,j,k,Rho_comp) = prev_cons(i,j,k,Rho_comp); - cur_cons(i,j,k,RhoTheta_comp) = prev_cons(i,j,k,RhoTheta_comp); - }); - } // step = 0 - - Box gtbz = mfi.nodaltilebox(2); - gtbz.grow(IntVect(1,1,0)); - ParallelFor(gtbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - old_drho_w(i,j,k) = prev_zmom(i,j,k) - stage_zmom(i,j,k); - }); - + const Array4& prev_drho_w = Delta_rho_w.array(mfi); + const Array4& prev_drho_theta = Delta_rho_theta.array(mfi); + const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); const Array4& theta_extrap = extrap.array(mfi); - ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - old_drho(i,j,k) = cur_cons(i,j,k,Rho_comp) - stage_cons(i,j,k,Rho_comp); - old_drho_theta(i,j,k) = cur_cons(i,j,k,RhoTheta_comp) - stage_cons(i,j,k,RhoTheta_comp); + + Box gbx = mfi.growntilebox(1); + ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + prev_drho_theta(i,j,k) = prev_cons(i,j,k,RhoTheta_comp) - stage_cons(i,j,k,RhoTheta_comp); if (step == 0) { - theta_extrap(i,j,k) = old_drho_theta(i,j,k); + theta_extrap(i,j,k) = prev_drho_theta(i,j,k); } else { - theta_extrap(i,j,k) = old_drho_theta(i,j,k) + beta_d * - ( old_drho_theta(i,j,k) - lagged_delta_rt(i,j,k,RhoTheta_comp) ); + theta_extrap(i,j,k) = 
prev_drho_theta(i,j,k) + beta_d * + ( prev_drho_theta(i,j,k) - lagged_delta_rt(i,j,k,RhoTheta_comp) ); } - }); - } // mfi - -#ifdef _OPENMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for ( MFIter mfi(S_stage_data[IntVars::cons],TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - // We define lagged_delta_rt for our next step as the current delta_rt - Box gbx = mfi.tilebox(); gbx.grow(1); - const Array4& lagged_delta_rt = S_scratch[IntVars::cons].array(mfi); - const Array4& old_drho_theta = Delta_rho_theta.array(mfi); + // We define lagged_delta_rt for our next step as the current delta_rt + // (after using it above to extrapolate theta for this step) + lagged_delta_rt(i,j,k,RhoTheta_comp) = prev_drho_theta(i,j,k); + }); - ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - lagged_delta_rt(i,j,k,RhoTheta_comp) = old_drho_theta(i,j,k); + // NOTE: We must do this here because for step > 0, prev_zmom and cur_zmom both point to the same data, + // so by the time we would use prev_zmom to define zflux, it would have already been over-written. + Box gtbz = mfi.nodaltilebox(2); + gtbz.grow(IntVect(1,1,0)); + ParallelFor(gtbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { + prev_drho_w(i,j,k) = prev_zmom(i,j,k) - stage_zmom(i,j,k); }); } // mfi @@ -203,56 +191,69 @@ void erf_fast_rhs_N (int step, int nrk, // ********************************************************************* // Define updates in the RHS of {x, y, z}-momentum equations // ********************************************************************* - { - BL_PROFILE("fast_rhs_xymom"); - ParallelFor(tbx, tby, - [=] AMREX_GPU_DEVICE (int i, int j, int k) - { - // Add (negative) gradient of (rho theta) multiplied by lagged "pi" - Real gpx = (theta_extrap(i,j,k) - theta_extrap(i-1,j,k))*dxi; - gpx *= mf_u(i,j,0); - - if (l_use_moisture) { - Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i-1,j,k,PrimQ1_comp) - +prim(i,j,k,PrimQ2_comp) + prim(i-1,j,k,PrimQ2_comp) ); - gpx /= (1.0 + q); - } - - Real pi_c = 0.5 * (pi_stage_ca(i-1,j,k,0) + pi_stage_ca(i,j,k,0)); + if (nrk == 0 and step == 0) { + ParallelFor(tbx, tby, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + Real new_drho_u = prev_xmom(i,j,k) - stage_xmom(i,j,k) + dtau * slow_rhs_rho_u(i,j,k); + avg_xmom(i,j,k) += facinv*new_drho_u; + temp_cur_xmom_arr(i,j,k) = stage_xmom(i,j,k) + new_drho_u; + }, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + Real new_drho_v = prev_ymom(i,j,k) - stage_ymom(i,j,k) + dtau * slow_rhs_rho_v(i,j,k); + avg_ymom(i,j,k) += facinv*new_drho_v; + temp_cur_ymom_arr(i,j,k) = stage_ymom(i,j,k) + new_drho_v; + }); + } else { + ParallelFor(tbx, tby, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + // Add (negative) gradient of (rho theta) multiplied by lagged "pi" + Real gpx = (theta_extrap(i,j,k) - theta_extrap(i-1,j,k))*dxi; + gpx *= mf_u(i,j,0); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i-1,j,k,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i-1,j,k,PrimQ2_comp) ); + gpx /= (1.0 + q); + } - Real fast_rhs_rho_u = -Gamma * R_d * pi_c * gpx; + Real pi_c = 0.5 * (pi_stage_ca(i-1,j,k,0) + pi_stage_ca(i,j,k,0)); - Real new_drho_u = prev_xmom(i,j,k) - stage_xmom(i,j,k) - + dtau * fast_rhs_rho_u + dtau * slow_rhs_rho_u(i,j,k); + Real fast_rhs_rho_u = -Gamma * R_d * pi_c * gpx; - avg_xmom(i,j,k) += facinv*new_drho_u; + Real new_drho_u = prev_xmom(i,j,k) - stage_xmom(i,j,k) + + dtau * fast_rhs_rho_u + dtau * slow_rhs_rho_u(i,j,k); - temp_cur_xmom_arr(i,j,k) = stage_xmom(i,j,k) + new_drho_u; - }, 
- [=] AMREX_GPU_DEVICE (int i, int j, int k) - { - // Add (negative) gradient of (rho theta) multiplied by lagged "pi" - Real gpy = (theta_extrap(i,j,k) - theta_extrap(i,j-1,k))*dyi; - gpy *= mf_v(i,j,0); + avg_xmom(i,j,k) += facinv*new_drho_u; - if (l_use_moisture) { - Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i,j-1,k,PrimQ1_comp) - +prim(i,j,k,PrimQ2_comp) + prim(i,j-1,k,PrimQ2_comp) ); - gpy /= (1.0 + q); - } + temp_cur_xmom_arr(i,j,k) = stage_xmom(i,j,k) + new_drho_u; + }, + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + // Add (negative) gradient of (rho theta) multiplied by lagged "pi" + Real gpy = (theta_extrap(i,j,k) - theta_extrap(i,j-1,k))*dyi; + gpy *= mf_v(i,j,0); + + if (l_use_moisture) { + Real q = 0.5 * ( prim(i,j,k,PrimQ1_comp) + prim(i,j-1,k,PrimQ1_comp) + +prim(i,j,k,PrimQ2_comp) + prim(i,j-1,k,PrimQ2_comp) ); + gpy /= (1.0 + q); + } - Real pi_c = 0.5 * (pi_stage_ca(i,j-1,k,0) + pi_stage_ca(i,j,k,0)); + Real pi_c = 0.5 * (pi_stage_ca(i,j-1,k,0) + pi_stage_ca(i,j,k,0)); - Real fast_rhs_rho_v = -Gamma * R_d * pi_c * gpy; + Real fast_rhs_rho_v = -Gamma * R_d * pi_c * gpy; - Real new_drho_v = prev_ymom(i,j,k) - stage_ymom(i,j,k) - + dtau * fast_rhs_rho_v + dtau * slow_rhs_rho_v(i,j,k); + Real new_drho_v = prev_ymom(i,j,k) - stage_ymom(i,j,k) + + dtau * fast_rhs_rho_v + dtau * slow_rhs_rho_v(i,j,k); - avg_ymom(i,j,k) += facinv*new_drho_v; + avg_ymom(i,j,k) += facinv*new_drho_v; - temp_cur_ymom_arr(i,j,k) = stage_ymom(i,j,k) + new_drho_v; - }); - } //profile + temp_cur_ymom_arr(i,j,k) = stage_ymom(i,j,k) + new_drho_v; + }); + } // nrk > 0 and/or step > 0 } //mfi #ifdef _OPENMP @@ -268,24 +269,25 @@ void erf_fast_rhs_N (int step, int nrk, Box vbx = mfi.validbox(); const auto& vbx_hi = ubound(vbx); - const Array4 & stage_xmom = S_stage_data[IntVars::xmom].const_array(mfi); - const Array4 & stage_ymom = S_stage_data[IntVars::ymom].const_array(mfi); - const Array4 & stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); + const Array4& stage_xmom = S_stage_data[IntVars::xmom].const_array(mfi); + const Array4& stage_ymom = S_stage_data[IntVars::ymom].const_array(mfi); + const Array4& stage_zmom = S_stage_data[IntVars::zmom].const_array(mfi); const Array4 & prim = S_stage_prim.const_array(mfi); - const Array4& old_drho_w = Delta_rho_w.array(mfi); - const Array4& old_drho = Delta_rho.array(mfi); - const Array4& old_drho_theta = Delta_rho_theta.array(mfi); + const Array4& prev_drho_theta = Delta_rho_theta.array(mfi); + + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); + const Array4& stage_cons = S_stage_data[IntVars::cons].const_array(mfi); const Array4& slow_rhs_cons = S_slow_rhs[IntVars::cons].const_array(mfi); const Array4& slow_rhs_rho_w = S_slow_rhs[IntVars::zmom].const_array(mfi); - const Array4& cur_zmom = S_data[IntVars::zmom].array(mfi); + const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); + const Array4< Real>& cur_zmom = S_data[IntVars::zmom].array(mfi); const Array4& temp_cur_xmom_arr = temp_cur_xmom.array(mfi); const Array4& temp_cur_ymom_arr = temp_cur_ymom.array(mfi); - const Array4& prev_zmom = S_prev[IntVars::zmom].const_array(mfi); // These store the advection momenta which we will use to update the slow variables const Array4< Real>& avg_zmom = S_scratch[IntVars::zmom].array(mfi); @@ -368,8 +370,6 @@ void erf_fast_rhs_N (int step, int nrk, // ********************************************************************* // fast_loop_on_shrunk // ********************************************************************* - { - 
BL_PROFILE("fast_loop_on_shrunk"); //Note we don't act on the bottom or top boundaries of the domain ParallelFor(bx_shrunk_in_k, [=] AMREX_GPU_DEVICE (int i, int j, int k) { @@ -392,8 +392,10 @@ void erf_fast_rhs_N (int step, int nrk, Real Omega_km1 = prev_zmom(i,j,k-1) - stage_zmom(i,j,k-1); // line 2 last two terms (order dtau) - Real R0_tmp = coeff_P * old_drho_theta(i,j,k) + coeff_Q * old_drho_theta(i,j,k-1) - - halfg * ( old_drho(i,j,k) + old_drho(i,j,k-1) ); + Real old_drho_k = prev_cons(i,j,k ,Rho_comp) - stage_cons(i,j,k ,Rho_comp); + Real old_drho_km1 = prev_cons(i,j,k-1,Rho_comp) - stage_cons(i,j,k-1,Rho_comp); + Real R0_tmp = coeff_P * prev_drho_theta(i,j,k) + coeff_Q * prev_drho_theta(i,j,k-1) + - halfg * ( old_drho_k + old_drho_km1 ); // lines 3-5 residuals (order dtau^2) 1.0 <-> beta_2 Real R1_tmp = halfg * (-slow_rhs_cons(i,j,k ,Rho_comp) @@ -409,94 +411,114 @@ void erf_fast_rhs_N (int step, int nrk, // line 1 RHS_a(i,j,k) = Omega_k + dtau * (slow_rhs_rho_w(i,j,k) + R0_tmp + dtau * beta_2 * R1_tmp); - }); - } // end profile + + }); // bx_shrunk_in_k Box b2d = tbz; // Copy constructor b2d.setRange(2,0); - { - BL_PROFILE("fast_rhs_b2d_loop"); -#ifdef AMREX_USE_GPU auto const lo = lbound(bx); auto const hi = ubound(bx); + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) { // w at bottom boundary of grid is 0 if at domain boundary, otherwise w = w_old + dtau * slow_rhs - RHS_a(i,j,lo.z) = dtau * slow_rhs_rho_w(i,j,lo.z); + RHS_a (i,j,lo.z ) = prev_zmom(i,j,lo.z ) - stage_zmom(i,j,lo.z) + + dtau * slow_rhs_rho_w(i,j,lo.z); // w at top boundary of grid is 0 if at domain boundary, otherwise w = w_old + dtau * slow_rhs - // TODO TODO: Note that if we ever change this, we will need to include it in avg_zmom at the top - RHS_a(i,j,hi.z+1) = dtau * slow_rhs_rho_w(i,j,hi.z+1); + RHS_a (i,j,hi.z+1) = prev_zmom(i,j,hi.z+1) - stage_zmom(i,j,hi.z+1) + + dtau * slow_rhs_rho_w(i,j,hi.z+1); + }); // b2d - // w = specified Dirichlet value at k = lo.z - soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); - cur_zmom(i,j,lo.z) = stage_zmom(i,j,lo.z) + soln_a(i,j,lo.z); +#ifdef AMREX_USE_GPU + if (l_implicit_substepping) { - for (int k = lo.z+1; k <= hi.z+1; k++) { - soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); - } + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + // w = specified Dirichlet value at k = lo.z + soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); + cur_zmom(i,j,lo.z) = stage_zmom(i,j,lo.z) + soln_a(i,j,lo.z); - cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + for (int k = lo.z+1; k <= hi.z+1; k++) { + soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + } - for (int k = hi.z; k >= lo.z; k--) { - soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) *soln_a(i,j,k+1); - cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); - } - }); // b2d + cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + + for (int k = hi.z; k >= lo.z; k--) { + soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) *soln_a(i,j,k+1); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + }); // b2d + + } else { // explicit substepping (beta_1 = 1; beta_2 = 0) + + ParallelFor(b2d, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + for (int k = lo.z; k <= hi.z+1; k++) { + soln_a(i,j,k) = RHS_a(i,j,k); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + }); // b2d + } // end of explicit substepping #else - auto const lo = lbound(bx); - auto 
const hi = ubound(bx); - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - // w at bottom boundary of grid is 0 if at domain boundary, otherwise w_old + dtau * slow_rhs - RHS_a (i,j,lo.z) = dtau * slow_rhs_rho_w(i,j,lo.z); - soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); - } - } - // Note that if we ever change this, we will need to include it in avg_zmom at the top - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - RHS_a (i,j,hi.z+1) = dtau * slow_rhs_rho_w(i,j,hi.z+1); - } - } - for (int k = lo.z+1; k <= hi.z+1; ++k) { + if (l_implicit_substepping) { + for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + soln_a(i,j,lo.z) = RHS_a(i,j,lo.z) * inv_coeffB_a(i,j,lo.z); } } - } - for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { - cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); + for (int k = lo.z+1; k <= hi.z+1; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,k) = (RHS_a(i,j,k)-coeffA_a(i,j,k)*soln_a(i,j,k-1)) * inv_coeffB_a(i,j,k); + } + } } - } - for (int k = hi.z; k >= lo.z; --k) { for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) * soln_a(i,j,k+1); - cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + cur_zmom(i,j,hi.z+1) = stage_zmom(i,j,hi.z+1) + soln_a(i,j,hi.z+1); } } - } + for (int k = hi.z; k >= lo.z; --k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + soln_a(i,j,k) -= ( coeffC_a(i,j,k) * inv_coeffB_a(i,j,k) ) * soln_a(i,j,k+1); + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + } + } + } else { // explicit substepping (beta_1 = 1; beta_2 = 0) + + for (int k = lo.z; k <= hi.z+1; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + + soln_a(i,j,k) = RHS_a(i,j,k); + + cur_zmom(i,j,k) = stage_zmom(i,j,k) + soln_a(i,j,k); + } + } + } + + } // end of explicit substepping #endif - } // end profile // ************************************************************************** // Define updates in the RHS of rho and (rho theta) // ************************************************************************** - { - BL_PROFILE("fast_rho_final_update"); + const Array4& prev_drho_w = Delta_rho_w.array(mfi); ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { - Real zflux_lo = beta_2 * soln_a(i,j,k ) + beta_1 * old_drho_w(i,j,k ); - Real zflux_hi = beta_2 * soln_a(i,j,k+1) + beta_1 * old_drho_w(i,j,k+1); + Real zflux_lo = beta_2 * soln_a(i,j,k ) + beta_1 * prev_drho_w(i,j,k ); + Real zflux_hi = beta_2 * soln_a(i,j,k+1) + beta_1 * prev_drho_w(i,j,k+1); avg_zmom(i,j,k) += facinv*zflux_lo / (mf_m(i,j,0) * mf_m(i,j,0)); (flx_arr[2])(i,j,k,0) = zflux_lo / (mf_m(i,j,0) * mf_m(i,j,0)); @@ -512,7 +534,6 @@ void erf_fast_rhs_N (int step, int nrk, temp_rhs_arr(i,j,k,RhoTheta_comp) += 0.5 * dzi * ( zflux_hi * (prim(i,j,k) + prim(i,j,k+1)) - zflux_lo * (prim(i,j,k) + prim(i,j,k-1)) ); }); - } // end profile // We only add to the flux registers in the final RK step if (l_reflux && nrk == 2) { @@ -546,15 +567,29 @@ void erf_fast_rhs_N (int step, int nrk, { const Box& bx = mfi.tilebox(); - int 
cons_dycore{2}; - const Array4& cur_cons = S_data[IntVars::cons].array(mfi); + const Array4< Real>& cur_cons = S_data[IntVars::cons].array(mfi); + const Array4& prev_cons = S_prev[IntVars::cons].const_array(mfi); auto const& temp_rhs_arr = temp_rhs.const_array(mfi); auto const& slow_rhs_cons = S_slow_rhs[IntVars::cons].const_array(mfi); - ParallelFor(bx, cons_dycore, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept - { - cur_cons(i,j,k,n) += dtau * (slow_rhs_cons(i,j,k,n) - temp_rhs_arr(i,j,k,n)); - }); + if (step == 0) { + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + cur_cons(i,j,k,Rho_comp) = prev_cons(i,j,k,Rho_comp) + + dtau * (slow_rhs_cons(i,j,k,Rho_comp) - temp_rhs_arr(i,j,k,Rho_comp)); + cur_cons(i,j,k,RhoTheta_comp) = prev_cons(i,j,k,RhoTheta_comp) + + dtau * (slow_rhs_cons(i,j,k,RhoTheta_comp) - temp_rhs_arr(i,j,k,RhoTheta_comp)); + }); + } else { + ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + // + // We didn't need to set cur_cons = prev_cons above because they point to the same data for step > 0 + // + cur_cons(i,j,k,Rho_comp) += dtau * (slow_rhs_cons(i,j,k,Rho_comp) - temp_rhs_arr(i,j,k,Rho_comp)); + cur_cons(i,j,k,RhoTheta_comp) += dtau * (slow_rhs_cons(i,j,k,RhoTheta_comp) - temp_rhs_arr(i,j,k,RhoTheta_comp)); + }); + } // step = 0 const Array4& cur_xmom = S_data[IntVars::xmom].array(mfi); const Array4& cur_ymom = S_data[IntVars::ymom].array(mfi); diff --git a/Source/TimeIntegration/ERF_fast_rhs_T.cpp b/Source/TimeIntegration/ERF_fast_rhs_T.cpp index a26289176..ed97b8469 100644 --- a/Source/TimeIntegration/ERF_fast_rhs_T.cpp +++ b/Source/TimeIntegration/ERF_fast_rhs_T.cpp @@ -6,39 +6,41 @@ using namespace amrex; /** * Function for computing the fast RHS with fixed terrain * - * @param[in] step which fast time step within each Runge-Kutta step - * @param[in] nrk which Runge-Kutta step - * @param[in] level level of resolution - * @param[in] finest_level finest level of resolution - * @param[in] S_slow_rhs slow RHS computed in erf_slow_rhs_pre - * @param[in] S_prev previous solution - * @param[in] S_stage_data solution at previous RK stage - * @param[in] S_stage_prim primitive variables at previous RK stage - * @param[in] pi_stage Exner function at previous RK stage - * @param[in] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator - * @param[out] S_data current solution - * @param[in] S_scratch scratch space - * @param[in] geom container for geometric information - * @param[in] gravity magnitude of gravity - * @param[in] Omega component of the momentum normal to the z-coordinate surface - * @param[in] z_phys_nd height coordinate at nodes - * @param[in] detJ_cc Jacobian of the metric transformation - * @param[in] dtau fast time step - * @param[in] beta_s Coefficient which determines how implicit vs explicit the solve is - * @param[in] facinv inverse factor for time-averaging the momenta - * @param[in] mapfac_m map factor at cell centers - * @param[in] mapfac_u map factor at x-faces - * @param[in] mapfac_v map factor at y-faces + * @param[in ] step which fast time step within each Runge-Kutta step + * @param[in ] nrk which Runge-Kutta step + * @param[in ] level level of resolution + * @param[in ] finest_level finest level of resolution + * @param[in ] S_slow_rhs slow RHS computed in erf_slow_rhs_pre + * @param[in ] S_prev previous solution + * @param[in ] S_stage_data solution at previous RK stage + * @param[in ] S_stage_prim primitive variables at previous RK stage + * @param[in ] 
pi_stage Exner function at previous RK stage + * @param[in ] fast_coeffs coefficients for the tridiagonal solve used in the fast integrator + * @param[ out] S_data current solution + * @param[in ] S_scratch scratch space + * @param[in ] geom container for geometric information + * @param[in ] gravity magnitude of gravity + * @param[in ] Omega component of the momentum normal to the z-coordinate surface + * @param[in ] z_phys_nd height coordinate at nodes + * @param[in ] detJ_cc Jacobian of the metric transformation + * @param[in ] dtau fast time step + * @param[in ] beta_s Coefficient which determines how implicit vs explicit the solve is + * @param[in ] facinv inverse factor for time-averaging the momenta + * @param[in ] mapfac_m map factor at cell centers + * @param[in ] mapfac_u map factor at x-faces + * @param[in ] mapfac_v map factor at y-faces * @param[inout] fr_as_crse YAFluxRegister at level l at level l / l+1 interface * @param[inout] fr_as_fine YAFluxRegister at level l at level l-1 / l interface - * @param[in] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_use_moisture + * @param[in ] l_reflux should we add fluxes to the FluxRegisters? + * @param[in ] l_implicit_substepping */ void erf_fast_rhs_T (int step, int nrk, int level, int finest_level, Vector& S_slow_rhs, // the slow RHS already computed const Vector& S_prev, // if step == 0, this is S_old, else the previous solution - Vector& S_stage_data, // S_bar = S^n, S^* or S^** + Vector& S_stage_data, // S_stage = S^n, S^* or S^** const MultiFab& S_stage_prim, // Primitive version of S_stage_data[IntVars::cons] const MultiFab& pi_stage, // Exner function evaluated at last stage const MultiFab& fast_coeffs, // Coeffs for tridiagonal solve @@ -57,7 +59,8 @@ void erf_fast_rhs_T (int step, int nrk, YAFluxRegister* fr_as_crse, YAFluxRegister* fr_as_fine, bool l_use_moisture, - bool l_reflux) + bool l_reflux, + bool /*l_implicit_substepping*/) { BL_PROFILE_REGION("erf_fast_rhs_T()"); diff --git a/Source/TimeIntegration/ERF_slow_rhs_pre.cpp b/Source/TimeIntegration/ERF_slow_rhs_pre.cpp index 7cb16958b..51890a8cd 100644 --- a/Source/TimeIntegration/ERF_slow_rhs_pre.cpp +++ b/Source/TimeIntegration/ERF_slow_rhs_pre.cpp @@ -165,7 +165,6 @@ void erf_slow_rhs_pre (int level, int finest_level, // We cannot use anelastic with terrain or with moisture AMREX_ALWAYS_ASSERT(!l_use_terrain || !l_anelastic); - AMREX_ALWAYS_ASSERT(!l_use_moisture || !l_anelastic); const Box& domain = geom.Domain(); const int domhi_z = domain.bigEnd(2); @@ -373,8 +372,9 @@ void erf_slow_rhs_pre (int level, int finest_level, // Now create Omega with momentum (not velocity) with z_t subtracted if moving terrain if (l_use_terrain) { - Box gbxo_lo = gbxo; gbxo_lo.setBig(2,0); - if (gbxo_lo.smallEnd(2) <= 0) { + Box gbxo_lo = gbxo; gbxo_lo.setBig(2,domain.smallEnd(2)); + int lo_z_face = domain.smallEnd(2); + if (gbxo_lo.smallEnd(2) <= lo_z_face) { ParallelFor(gbxo_lo, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept { omega_arr(i,j,k) = 0.; }); diff --git a/Source/Utils/ERF_InteriorGhostCells.cpp b/Source/Utils/ERF_InteriorGhostCells.cpp index 1cdfafcc7..76a57beea 100644 --- a/Source/Utils/ERF_InteriorGhostCells.cpp +++ b/Source/Utils/ERF_InteriorGhostCells.cpp @@ -89,7 +89,6 @@ compute_interior_ghost_bxs_xy (const Box& bx, /** * Compute the RHS in the relaxation zone * - * @param[in] init_type initialization method for this simulation * @param[in] bdy_time_interval time interval between boundary condition time stamps * @param[in] time current 
time * @param[in] delta_t timestep @@ -105,8 +104,7 @@ compute_interior_ghost_bxs_xy (const Box& bx, * @param[in] start_bdy_time time of the first boundary data read in */ void -realbdy_compute_interior_ghost_rhs (const std::string& /*init_type*/, - const Real& bdy_time_interval, +realbdy_compute_interior_ghost_rhs (const Real& bdy_time_interval, const Real& start_bdy_time, const Real& time, const Real& delta_t, diff --git a/Source/Utils/ERF_PoissonSolve.cpp b/Source/Utils/ERF_PoissonSolve.cpp index 283f73025..d5a82fbc4 100644 --- a/Source/Utils/ERF_PoissonSolve.cpp +++ b/Source/Utils/ERF_PoissonSolve.cpp @@ -119,10 +119,57 @@ void ERF::project_velocities (int lev, Real l_dt, Vector& mom_mf, Mult Real start_step = static_cast(ParallelDescriptor::second()); -#ifdef ERF_USE_HEFFTE - if (use_heffte) { - solve_with_heffte(lev, rhs[0], phi[0], fluxes[0]); - } else +#ifdef ERF_USE_FFT + if (use_fft) { + AMREX_ALWAYS_ASSERT(lev == 0); + if (!m_poisson) { + m_poisson = std::make_unique>(Geom(0)); + } + m_poisson->solve(phi[lev], rhs[lev]); + + phi[lev].FillBoundary(geom[lev].periodicity()); + + auto dxInv = geom[lev].InvCellSizeArray(); + +#ifdef _OPENMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(phi[lev], TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + Array4 const& p_arr = phi[lev].array(mfi); + + Box const& xbx = mfi.nodaltilebox(0); + const Real dx_inv = dxInv[0]; + Array4 const& fx_arr = fluxes[lev][0].array(mfi); + ParallelFor(xbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + fx_arr(i,j,k) = -(p_arr(i,j,k) - p_arr(i-1,j,k)) * dx_inv; + }); + + Box const& ybx = mfi.nodaltilebox(1); + const Real dy_inv = dxInv[1]; + Array4 const& fy_arr = fluxes[lev][1].array(mfi); + ParallelFor(ybx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + fy_arr(i,j,k) = -(p_arr(i,j,k) - p_arr(i,j-1,k)) * dy_inv; + }); + + auto const dom_lo = lbound(geom[lev].Domain()); + auto const dom_hi = ubound(geom[lev].Domain()); + + Box const& zbx = mfi.nodaltilebox(2); + const Real dz_inv = dxInv[2]; + Array4 const& fz_arr = fluxes[lev][2].array(mfi); + ParallelFor(zbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + if (k == dom_lo.z || k == dom_hi.z+1) { + fz_arr(i,j,k) = 0.0; + } else { + fz_arr(i,j,k) = -(p_arr(i,j,k) - p_arr(i,j,k-1)) * dz_inv; + } + }); + } // mfi + } else #endif { // Initialize phi to 0 @@ -142,11 +189,6 @@ void ERF::project_velocities (int lev, Real l_dt, Vector& mom_mf, Mult mlmg.getFluxes(GetVecOfArrOfPtrs(fluxes)); } - Real end_step = static_cast(ParallelDescriptor::second()); - if (mg_verbose > 0) { - amrex::Print() << "Time in solve " << end_step - start_step << std::endl; - } - // Subtract dt grad(phi) from the momenta MultiFab::Add(mom_mf[IntVars::xmom],fluxes[0][0],0,0,1,0); MultiFab::Add(mom_mf[IntVars::ymom],fluxes[0][1],0,0,1,0); @@ -156,6 +198,11 @@ void ERF::project_velocities (int lev, Real l_dt, Vector& mom_mf, Mult MultiFab::Saxpy(pmf, 1.0/l_dt, phi[0],0,0,1,0); pmf.FillBoundary(geom[lev].periodicity()); + Real end_step = static_cast(ParallelDescriptor::second()); + if (mg_verbose > 0) { + amrex::Print() << "Time in solve " << end_step - start_step << std::endl; + } + #ifdef _OPENMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif diff --git a/Source/Utils/ERF_Utils.H b/Source/Utils/ERF_Utils.H index 0362866b9..f713347f2 100644 --- a/Source/Utils/ERF_Utils.H +++ b/Source/Utils/ERF_Utils.H @@ -88,8 +88,7 @@ void compute_interior_ghost_bxs_xy (const amrex::Box& bx, /* * Compute relaxation region RHS with wrfbdy */ 
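The ERF_PoissonSolve.cpp hunk above follows the usual projection pattern: once phi is solved for, the face-centered fluxes are the negative discrete gradient of phi, with the vertical flux pinned to zero on the domain's bottom and top faces before the momenta are corrected. A compact sketch of that stencil on a flat 3-D array; the PhiGrid helper and its layout are assumptions for illustration, not ERF's API:

#include <cstddef>
#include <vector>

// Illustrative cell-centered phi on a uniform grid; idx() flattens (i,j,k).
struct PhiGrid {
    int nx, ny, nz;                // cell counts
    double dzinv;                  // inverse vertical cell size
    std::vector<double> phi;       // size nx*ny*nz
    std::size_t idx(int i, int j, int k) const {
        return static_cast<std::size_t>(i)
             + static_cast<std::size_t>(nx)
             * (static_cast<std::size_t>(j)
                + static_cast<std::size_t>(ny) * static_cast<std::size_t>(k));
    }
};

// z-face flux = -d(phi)/dz; faces k = 0 and k = nz carry zero flux,
// matching the dom_lo.z / dom_hi.z+1 special case in the hunk above.
double zface_flux(const PhiGrid& g, int i, int j, int k)
{
    if (k == 0 || k == g.nz) { return 0.0; }
    return -(g.phi[g.idx(i,j,k)] - g.phi[g.idx(i,j,k-1)]) * g.dzinv;
}

The x- and y-face fluxes follow the same stencil with i-1 and j-1 offsets and no boundary special case, since those directions are periodic in this solve.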
-void realbdy_compute_interior_ghost_rhs (const std::string& init_type, - const amrex::Real& bdy_time_interval, +void realbdy_compute_interior_ghost_rhs (const amrex::Real& bdy_time_interval, const amrex::Real& start_bdy_time, const amrex::Real& time, const amrex::Real& delta_t, diff --git a/Source/Utils/ERF_VelocityToMomentum.cpp b/Source/Utils/ERF_VelocityToMomentum.cpp index 271356276..213572e41 100644 --- a/Source/Utils/ERF_VelocityToMomentum.cpp +++ b/Source/Utils/ERF_VelocityToMomentum.cpp @@ -54,19 +54,9 @@ void VelocityToMomentum (const MultiFab& xvel_in, tby = mfi.tilebox(IntVect(0,1,0),yvel_ngrow); tbz = mfi.tilebox(IntVect(0,0,1),zvel_ngrow); -#if 0 - if (l_use_ndiff) { - tbx = mfi.tilebox(IntVect(1,0,0),xvel_ngrow); - tby = mfi.tilebox(IntVect(0,1,0),yvel_ngrow); - tbz = mfi.tilebox(IntVect(0,0,1),zvel_ngrow); - } else { - tbx = mfi.tilebox(IntVect(1,0,0),IntVect(1,1,1)); - if (tbx.smallEnd(2) < 0) tbx.setSmall(2,0); - tby = mfi.tilebox(IntVect(0,1,0),IntVect(1,1,1)); - if (tby.smallEnd(2) < 0) tby.setSmall(2,0); - tbz = mfi.tilebox(IntVect(0,0,1),IntVect(1,1,0)); - } -#endif + // Don't actually try to fill w above or below the domain + if (tbz.smallEnd(2) < domain.smallEnd(2)) tbz.setSmall(2,domain.smallEnd(2)); + if (tbz.bigEnd(2) > domain.bigEnd(2)+1) tbz.setBig(2,domain.bigEnd(2)+1); // Conserved/state variables on cell centers -- we use this for density const Array4& dens_arr = density.array(mfi); @@ -81,6 +71,8 @@ void VelocityToMomentum (const MultiFab& xvel_in, const Array4& vely = yvel_in.const_array(mfi); const Array4& velz = zvel_in.const_array(mfi); + // ******************************************************************************************** + ParallelFor(tbx, tby, tbz, [=] AMREX_GPU_DEVICE (int i, int j, int k) { momx(i,j,k) = velx(i,j,k) * 0.5 * (dens_arr(i,j,k,Rho_comp) + dens_arr(i-1,j,k,Rho_comp)); @@ -92,6 +84,8 @@ void VelocityToMomentum (const MultiFab& xvel_in, momz(i,j,k) = velz(i,j,k) * 0.5 * (dens_arr(i,j,k,Rho_comp) + dens_arr(i,j,k-1,Rho_comp)); }); + // ******************************************************************************************** + if ( (bx.smallEnd(0) == domain.smallEnd(0)) && (bc_ptr_h[BCVars::cons_bc].lo(0) == ERFBCType::ext_dir) ) { ParallelFor(makeSlab(tbx,0,domain.smallEnd(0)), [=] AMREX_GPU_DEVICE (int i, int j, int k) { diff --git a/Source/Utils/ERF_solve_with_heffte.cpp b/Source/Utils/ERF_solve_with_heffte.cpp deleted file mode 100644 index e95025ff1..000000000 --- a/Source/Utils/ERF_solve_with_heffte.cpp +++ /dev/null @@ -1,243 +0,0 @@ -#include "ERF.H" -#include -#include -#include "ERF_Utils.H" -#ifdef ERF_USE_HEFFTE -#include "heffte.h" -#endif - -#ifdef ERF_USE_HEFFTE - -using namespace amrex; - -void ERF::solve_with_heffte (int lev, MultiFab& rhs, MultiFab& phi, - Array& fluxes) -{ - BoxArray ba(rhs.boxArray()); - DistributionMapping dm(rhs.DistributionMap()); - - // The heffte solve uses a solution array with no ghost cells - MultiFab soln(ba,dm,1,0); - - // Determine the domain length in each direction - Real L_x = geom[lev].ProbHi(0) - geom[lev].ProbLo(0); - Real L_y = geom[lev].ProbHi(1) - geom[lev].ProbLo(1); - Real L_z = geom[lev].ProbHi(2) - geom[lev].ProbLo(2); - - Box domain = geom[lev].Domain(); - auto const& domlo = lbound(domain); - auto const& domhi = ubound(domain); - - int n_cell_x = domain.length(0); - int n_cell_y = domain.length(1); - int n_cell_z = domain.length(2); - - auto dx = geom[lev].CellSize(); - auto dxinv = geom[lev].InvCellSize(); - - // Since there is 1 MPI rank per box, here each MPI 
rank obtains its local box and the associated boxid - Box local_box; - int local_boxid; - { - for (int i = 0; i < ba.size(); ++i) { - Box b = ba[i]; - // each MPI rank has its own local_box Box and local_boxid ID - if (ParallelDescriptor::MyProc() == dm[i]) { - local_box = b; - local_boxid = i; - } - } - } - - // Now each MPI rank works on its own box - // Ror real->complex fft's, the fft is stored in an (nx/2+1) x ny x nz dataset - - // start by coarsening each box by 2 in the x-direction - Box c_local_box = amrex::coarsen(local_box, IntVect(AMREX_D_DECL(2,1,1))); - - // If the coarsened box's high-x index is even, we shrink the size in 1 in x - // this avoids overlap between coarsened boxes - if (c_local_box.bigEnd(0) * 2 == local_box.bigEnd(0)) { - c_local_box.setBig(0,c_local_box.bigEnd(0)-1); - } - // For any boxes that touch the hi-x domain we increase the size of boxes by 1 in x - // This makes the overall fft dataset have size (Nx/2+1 x Ny x Nz) - if (local_box.bigEnd(0) == geom[lev].Domain().bigEnd(0)) { - c_local_box.growHi(0,1); - } - - // Each MPI rank gets storage for its piece of the fft - BaseFab > spectral_field(c_local_box, 1, The_Device_Arena()); - - // Create real->complex fft objects with the appropriate backend and data about - // the domain size and its local box size - - bool do_2d_solves = false; - - // ******************************************************************************************** - // ******************************************************************************************** - // ******************************************************************************************** - - // ******************************************************************************************** - // NOTE: THIS IS A WIP - IT DOES NOT WORK YET - // ******************************************************************************************** - if (do_2d_solves) { - -#ifdef AMREX_USE_CUDA - heffte::fft2d_r2c fft -#elif AMREX_USE_HIP - heffte::fft2d_r2c fft -#else - heffte::fft2d_r2c fft -#endif - ({{local_box.smallEnd(0),local_box.smallEnd(1),0}, - {local_box.bigEnd(0) ,local_box.bigEnd(1) ,0}}, - {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),0}, - {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,0}}, - 0, ParallelDescriptor::Communicator()); - - using heffte_complex = typename heffte::fft_output::type; - heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); - - // ******************************************************************************************** - - for (int k = domlo.z; k <= domhi.z; k++) { - int offset = k * (n_cell_x*n_cell_y); - fft.forward(rhs[local_boxid].dataPtr(offset), spectral_data); - } - - // ******************************************************************************************** - - // Now we take the standard FFT and scale it by 1/k^2 - Array4< GpuComplex > spectral = spectral_field.array(); - - ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) - { - Real a = 2.*M_PI*i / L_x; - Real b = 2.*M_PI*j / L_y; - Real c = 2.*M_PI*k / L_z; - - // the values in the upper-half of the spectral array in y and z are here interpreted as negative wavenumbers - if (j >= n_cell_y/2) b = 2.*M_PI*(n_cell_y-j) / L_y; - if (k >= n_cell_z/2) c = 2.*M_PI*(n_cell_z-k) / L_z; - - Real k2 = 2.0*(std::cos(a*dx[0])-1.)*(dxinv[0]*dxinv[0]) + - 2.0*(std::cos(b*dx[1])-1.)*(dxinv[1]*dxinv[1]) ; - if (k2 != 0.) 
{ - spectral(i,j,k) /= k2; - } else { - spectral(i,j,k) *= 0.; // interpretation here is that the average value of the solution is zero - } - }); - - // ******************************************************************************************** - - for (int k = domlo.z; k <= domhi.z; k++) { - int offset = k * (n_cell_x*n_cell_y); - fft.backward(spectral_data, soln[local_boxid].dataPtr(offset)); - } - - // ******************************************************************************************** - - } else { - -#ifdef AMREX_USE_CUDA - heffte::fft3d_r2c fft -#elif AMREX_USE_HIP - heffte::fft3d_r2c fft -#else - heffte::fft3d_r2c fft -#endif - ({{local_box.smallEnd(0),local_box.smallEnd(1),local_box.smallEnd(2)}, - {local_box.bigEnd(0) ,local_box.bigEnd(1) ,local_box.bigEnd(2)}}, - {{c_local_box.smallEnd(0),c_local_box.smallEnd(1),c_local_box.smallEnd(2)}, - {c_local_box.bigEnd(0) ,c_local_box.bigEnd(1) ,c_local_box.bigEnd(2)}}, - 0, ParallelDescriptor::Communicator()); - - using heffte_complex = typename heffte::fft_output::type; - heffte_complex* spectral_data = (heffte_complex*) spectral_field.dataPtr(); - - // ******************************************************************************************** - - fft.forward(rhs[local_boxid].dataPtr(), spectral_data); - - // ******************************************************************************************** - - // Now we take the standard FFT and scale it by 1/k^2 - Array4< GpuComplex > spectral = spectral_field.array(); - - ParallelFor(c_local_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) - { - Real a = 2.*M_PI*i / L_x; - Real b = 2.*M_PI*j / L_y; - Real c = 2.*M_PI*k / L_z; - - // the values in the upper-half of the spectral array in y and z are here interpreted as negative wavenumbers - if (j >= n_cell_y/2) b = 2.*M_PI*(n_cell_y-j) / L_y; - if (k >= n_cell_z/2) c = 2.*M_PI*(n_cell_z-k) / L_z; - - Real k2 = 2.0*(std::cos(a*dx[0])-1.)*(dxinv[0]*dxinv[0]) + - 2.0*(std::cos(b*dx[1])-1.)*(dxinv[1]*dxinv[1]) + - 2.0*(std::cos(c*dx[2])-1.)*(dxinv[2]*dxinv[2]); - if (k2 != 0.) 
{ - spectral(i,j,k) /= k2; - } else { - spectral(i,j,k) *= 0.; // interpretation here is that the average value of the solution is zero - } - }); - - // ******************************************************************************************** - - fft.backward(spectral_data, soln[local_boxid].dataPtr()); - - // ******************************************************************************************** - - } // 3d solve - - // ******************************************************************************************** - // ******************************************************************************************** - // ******************************************************************************************** - - // Scale by 1/npts (both forward and inverse need sqrt(npts) scaling so I am doing it all here) - Real npts = static_cast(ba.numPts()); - soln.mult(1./npts); - - // ******************************************************************************************** - - phi.copy(soln); - phi.FillBoundary(geom[lev].periodicity()); - -#ifdef _OPENMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(soln, TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - Array4 const& p_arr = phi.array(mfi); - - Box const& xbx = mfi.nodaltilebox(0); - const Real dx_inv = dxinv[0]; - Array4 const& fx_arr = fluxes[0].array(mfi); - ParallelFor(xbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept - { - fx_arr(i,j,k) = -(p_arr(i,j,k) - p_arr(i-1,j,k)) * dx_inv; - }); - - Box const& ybx = mfi.nodaltilebox(1); - const Real dy_inv = dxinv[1]; - Array4 const& fy_arr = fluxes[1].array(mfi); - ParallelFor(ybx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept - { - fy_arr(i,j,k) = -(p_arr(i,j,k) - p_arr(i,j-1,k)) * dy_inv; - }); - - Box const& zbx = mfi.nodaltilebox(2); - const Real dz_inv = dxinv[2]; - Array4 const& fz_arr = fluxes[2].array(mfi); - ParallelFor(zbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept - { - fz_arr(i,j,k) = -(p_arr(i,j,k) - p_arr(i,j-1,k)) * dz_inv; - }); - } // mfi -} - -#endif diff --git a/Source/Utils/Make.package b/Source/Utils/Make.package index e6a103db0..f757c99f9 100644 --- a/Source/Utils/Make.package +++ b/Source/Utils/Make.package @@ -25,6 +25,3 @@ CEXE_sources += ERF_Time_Avg_Vel.cpp CEXE_sources += ERF_PoissonSolve.cpp CEXE_sources += ERF_PoissonSolve_tb.cpp -ifeq ($(USE_HEFFTE),TRUE) -CEXE_sources += ERF_solve_with_heffte.cpp -endif diff --git a/Source/WindFarmParametrization/ERF_InitWindFarm.cpp b/Source/WindFarmParametrization/ERF_InitWindFarm.cpp index 32fae15ce..99a6f2f20 100644 --- a/Source/WindFarmParametrization/ERF_InitWindFarm.cpp +++ b/Source/WindFarmParametrization/ERF_InitWindFarm.cpp @@ -3,6 +3,8 @@ */ #include +#include +#include // For POSIX directory handling using namespace amrex; @@ -123,6 +125,14 @@ WindFarm::init_windfarm_lat_lon (const std::string windfarm_loc_table, xloc[it] = xloc[it] - xloc_min + windfarm_x_shift; yloc[it] = yloc[it] - yloc_min + windfarm_y_shift; } + + FILE* file_xy_loc; + file_xy_loc = fopen("file_xy_loc_KingPlains.txt","w"); + + for(int it = 0;it> var1 >> temp >> temp >> temp >> var2 >> var3 >> temp) { + bld_rad_loc.push_back(var1); + bld_twist.push_back(var2); + bld_chord.push_back(var3); + //int idx = bld_rad_loc.size()-1; + //printf("Values are = %0.15g %0.15g %0.15g\n", bld_rad_loc[idx], bld_twist[idx], bld_chord[idx]); + } + set_blade_spec(bld_rad_loc, bld_twist, bld_chord); + n_bld_sections = bld_rad_loc.size(); + } +} + +void +WindFarm::read_windfarm_spec_table_extra(const std::string 
windfarm_spec_table_extra) +{ + // Open the file + std::ifstream file(windfarm_spec_table_extra); + + // Check if file opened successfully + if (!file.is_open()) { + Abort("Error: You are using generalized wind farms option. This requires an input file erf.windfarm_spec_table_extra." + " Either this entry is missing in the inputs or the file specified -" + windfarm_spec_table_extra + " does" + " not exist. Exiting..."); + } else { + printf("Reading in windfarm_spec_table_extra %s", windfarm_spec_table_extra.c_str()); + } + + // Ignore the first line (header) + std::string header; + std::getline(file, header); + + // Variables to hold each row's values + double V, Cp, Ct, rpm, pitch, temp; + + // Read the file row by row + while (file >> V) { + char comma; // To ignore the commas + file >> comma >> Cp >> comma >> Ct >> comma >> temp >> comma >> temp >> comma + >> temp >> comma >> rpm >> comma >> pitch >> comma >> temp; + + velocity.push_back(V); + C_P.push_back(Cp); + C_T.push_back(Ct); + rotor_RPM.push_back(rpm); + blade_pitch.push_back(pitch); + } + + set_turb_spec_extra(velocity, C_P, C_T, rotor_RPM, blade_pitch); +} + + +void +WindFarm::read_windfarm_airfoil_tables(const std::string windfarm_airfoil_tables, + const std::string windfarm_blade_table) +{ + DIR* dir; + struct dirent* entry; + std::vector files; + + // Check if directory exists + if ((dir = opendir(windfarm_airfoil_tables.c_str())) == nullptr) { + Abort("You are using a generalized actuator disk model based on blade element theory. This needs info of airfoil" + " cross sections over the span of the blade. There needs to be an entry erf.airfoil_tables which is the directory that" + " contains the angle of attack, Cl, Cd data for each airfoil cross-section. Either the entry is missing or the directory specified" + " in the entry - " + windfarm_airfoil_tables + " is missing. Exiting..."); + } + + // Loop through directory entries and collect filenames + while ((entry = readdir(dir)) != nullptr) { + // Skip special directory entries "." and ".." + if (std::string(entry->d_name) == "." || std::string(entry->d_name) == "..") { + continue; + } + files.emplace_back(windfarm_airfoil_tables + "/" + entry->d_name); // Add file path to vector } + + // Close the directory + closedir(dir); + + if (files.empty()) { + Abort("It seems the directory containing the info of airfoil cross sections of the blades - " + windfarm_airfoil_tables + + " is empty. Exiting..."); + } + + if(files.size() != static_cast(n_bld_sections)) { + printf("There are %d airfoil sections in the last column of %s. But the number" + " of files in %s is only %ld.\n", n_bld_sections, windfarm_blade_table.c_str(), + windfarm_airfoil_tables.c_str(), files.size()); + Abort("The number of blade sections from " + windfarm_blade_table + " should match the number of" + " files in " + windfarm_airfoil_tables + ". 
+void
+WindFarm::read_windfarm_airfoil_tables (const std::string windfarm_airfoil_tables,
+                                        const std::string windfarm_blade_table)
+{
+    DIR* dir;
+    struct dirent* entry;
+    std::vector<std::string> files;
+
+    // Check if the directory exists
+    if ((dir = opendir(windfarm_airfoil_tables.c_str())) == nullptr) {
+        Abort("You are using a generalized actuator disk model based on blade element theory. This needs info about the airfoil"
+              " cross sections over the span of the blade. There needs to be an entry erf.airfoil_tables which is the directory that"
+              " contains the angle of attack, Cl, Cd data for each airfoil cross section. Either the entry is missing or the directory specified"
+              " in the entry - " + windfarm_airfoil_tables + " is missing. Exiting...");
+    }
+
+    // Loop through the directory entries and collect filenames
+    while ((entry = readdir(dir)) != nullptr) {
+        // Skip the special directory entries "." and ".."
+        if (std::string(entry->d_name) == "." || std::string(entry->d_name) == "..") {
+            continue;
+        }
+        files.emplace_back(windfarm_airfoil_tables + "/" + entry->d_name); // Add the file path to the vector
+    }
+
+    // Close the directory
+    closedir(dir);
+
+    if (files.empty()) {
+        Abort("It seems the directory containing the info of airfoil cross sections of the blades - " + windfarm_airfoil_tables +
+              " is empty. Exiting...");
+    }
+
+    if(files.size() != static_cast<size_t>(n_bld_sections)) {
+        printf("There are %d airfoil sections in the last column of %s. But the number"
+               " of files in %s is %zu.\n", n_bld_sections, windfarm_blade_table.c_str(),
+               windfarm_airfoil_tables.c_str(), files.size());
+        Abort("The number of blade sections from " + windfarm_blade_table + " should match the number of"
+              " files in " + windfarm_airfoil_tables + ". Exiting...");
+    }
+
+    // Sort filenames in lexicographical (alphabetical) order
+    std::sort(files.begin(), files.end());
+
+    // Process each file
+    int count = 0;
+    bld_airfoil_aoa.resize(n_bld_sections);
+    bld_airfoil_Cl.resize(n_bld_sections);
+    bld_airfoil_Cd.resize(n_bld_sections);
+    for (const auto& filePath : files) {
+        std::ifstream filename(filePath.c_str());
+
+        if (!filename.is_open()) {
+            std::cerr << "Failed to open file: " << filePath << std::endl;
+            continue; // Move on to the next file
+        }
+
+        std::cout << "Reading file: " << filePath << std::endl;
+
+        std::string line;
+        for (int i = 0; i < 54; ++i) {
+            std::getline(filename, line); // Read and discard the 54 header lines
+        }
+
+        Real var1, var2, var3, temp;
+
+        while(filename >> var1 >> var2 >> var3 >> temp) {
+            bld_airfoil_aoa[count].push_back(var1);
+            bld_airfoil_Cl[count].push_back(var2);
+            bld_airfoil_Cd[count].push_back(var3);
+            //int idx = bld_airfoil_aoa[count].size()-1;
+            //printf("Values are = %0.15g %0.15g %0.15g\n", bld_airfoil_aoa[count][idx], bld_airfoil_Cl[count][idx], bld_airfoil_Cd[count][idx]);
+        }
+        count++;
+    }
+
+    set_blade_airfoil_spec(bld_airfoil_aoa, bld_airfoil_Cl, bld_airfoil_Cd);
+}
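// Editor's aside (alternative sketch, not what this diff uses; the function
// name is hypothetical): with C++17, std::filesystem can replace the POSIX
// dirent calls above and never reports the "." and ".." entries:
#include <filesystem>
#include <algorithm>
#include <string>
#include <vector>
std::vector<std::string> list_airfoil_tables (const std::string& dir)
{
    std::vector<std::string> files;
    for (const auto& entry : std::filesystem::directory_iterator(dir)) {
        if (entry.is_regular_file()) {
            files.push_back(entry.path().string());
        }
    }
    std::sort(files.begin(), files.end()); // same lexicographic order as above
    return files;
}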
 void
diff --git a/Source/WindFarmParametrization/ERF_WindFarm.H b/Source/WindFarmParametrization/ERF_WindFarm.H
index ee7742de9..100a81f0a 100644
--- a/Source/WindFarmParametrization/ERF_WindFarm.H
+++ b/Source/WindFarmParametrization/ERF_WindFarm.H
@@ -65,7 +65,10 @@ public:

 void read_windfarm_blade_table(const std::string windfarm_blade_table);

-    void read_windfarm_airofil_tables(const std::string windfarm_airfoils_tables);
+    void read_windfarm_airfoil_tables(const std::string windfarm_airfoil_tables,
+                                      const std::string windfarm_blade_table);
+
+    void read_windfarm_spec_table_extra(const std::string windfarm_spec_table_extra);

 void fill_Nturb_multifab(const amrex::Geometry& geom, amrex::MultiFab& mf_Nturb);

@@ -108,9 +111,32 @@ public:
 m_windfarm_model[0]->set_turb_loc(a_xloc, a_yloc);
 }

-    void set_turb_disk_angle (const amrex::Real& turb_disk_angle) override
+    void set_turb_disk_angle (const amrex::Real& a_turb_disk_angle) override
+    {
+        m_windfarm_model[0]->set_turb_disk_angle(a_turb_disk_angle);
+    }
+
+    void set_blade_spec (const amrex::Vector<amrex::Real>& a_bld_rad_loc,
+                         const amrex::Vector<amrex::Real>& a_bld_twist,
+                         const amrex::Vector<amrex::Real>& a_bld_chord) override
+    {
+        m_windfarm_model[0]->set_blade_spec(a_bld_rad_loc, a_bld_twist, a_bld_chord);
+    }
+
+    void set_blade_airfoil_spec (const amrex::Vector<amrex::Vector<amrex::Real>>& a_bld_airfoil_aoa,
+                                 const amrex::Vector<amrex::Vector<amrex::Real>>& a_bld_airfoil_Cl,
+                                 const amrex::Vector<amrex::Vector<amrex::Real>>& a_bld_airfoil_Cd) override
+    {
+        m_windfarm_model[0]->set_blade_airfoil_spec(a_bld_airfoil_aoa, a_bld_airfoil_Cl, a_bld_airfoil_Cd);
+    }
+
+    void set_turb_spec_extra (const amrex::Vector<amrex::Real>& a_velocity,
+                              const amrex::Vector<amrex::Real>& a_C_P,
+                              const amrex::Vector<amrex::Real>& a_C_T,
+                              const amrex::Vector<amrex::Real>& a_rotor_RPM,
+                              const amrex::Vector<amrex::Real>& a_blade_pitch) override
     {
-        m_windfarm_model[0]->set_turb_disk_angle(turb_disk_angle);
+        m_windfarm_model[0]->set_turb_spec_extra(a_velocity, a_C_P, a_C_T, a_rotor_RPM, a_blade_pitch);
     }

 protected:
@@ -119,6 +145,10 @@
 amrex::Real my_turb_disk_angle;
 amrex::Real hub_height, rotor_rad, thrust_coeff_standing, nominal_power;
 amrex::Vector<amrex::Real> wind_speed, thrust_coeff, power;
+    amrex::Vector<amrex::Real> bld_rad_loc, bld_twist, bld_chord;
+    amrex::Vector<amrex::Vector<amrex::Real>> bld_airfoil_aoa, bld_airfoil_Cl, bld_airfoil_Cd;
+    int n_bld_sections;
+    amrex::Vector<amrex::Real> velocity, C_P, C_T, rotor_RPM, blade_pitch;

 /*!
 \brief Create and set the specified windfarm model
 */
 template
diff --git a/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_AdvanceGeneralAD.cpp b/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_AdvanceGeneralAD.cpp
index 96243d29b..42f9d4be3 100644
--- a/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_AdvanceGeneralAD.cpp
+++ b/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_AdvanceGeneralAD.cpp
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include

 using namespace amrex;

@@ -18,14 +19,17 @@ GeneralAD::advance (const Geometry& geom,
 AMREX_ALWAYS_ASSERT(W_old.nComp() > 0);
 AMREX_ALWAYS_ASSERT(mf_Nturb.nComp() > 0);
 AMREX_ALWAYS_ASSERT(mf_vars_generalAD.nComp() > 0);
+    compute_freestream_velocity(cons_in, U_old, V_old, mf_SMark);
 source_terms_cellcentered(geom, cons_in, mf_SMark, mf_vars_generalAD);
-    update(dt_advance, cons_in, U_old, V_old, mf_vars_generalAD);
+    update(dt_advance, cons_in, U_old, V_old, W_old, mf_vars_generalAD);
 }

 void
 GeneralAD::update (const Real& dt_advance,
                    MultiFab& cons_in,
-                   MultiFab& U_old, MultiFab& V_old,
+                   MultiFab& U_old,
+                   MultiFab& V_old,
+                   MultiFab& W_old,
                    const MultiFab& mf_vars_generalAD)
 {

@@ -33,12 +37,14 @@ GeneralAD::update (const Real& dt_advance,

 Box tbx = mfi.nodaltilebox(0);
 Box tby = mfi.nodaltilebox(1);
+        Box tbz = mfi.nodaltilebox(2);

 auto generalAD_array = mf_vars_generalAD.array(mfi);
 auto u_vel = U_old.array(mfi);
 auto v_vel = V_old.array(mfi);
+        auto w_vel = W_old.array(mfi);

-        ParallelFor(tbx, tby,
+        ParallelFor(tbx, tby, tbz,
         [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
         {
             u_vel(i,j,k) = u_vel(i,j,k) + (generalAD_array(i-1,j,k,0) + generalAD_array(i,j,k,0))/2.0*dt_advance;
@@ -46,10 +52,235 @@ GeneralAD::update (const Real& dt_advance,
         [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
         {
             v_vel(i,j,k) = v_vel(i,j,k) + (generalAD_array(i,j-1,k,1) + generalAD_array(i,j,k,1))/2.0*dt_advance;
+        },
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            w_vel(i,j,k) = w_vel(i,j,k) + (generalAD_array(i,j,k-1,2) + generalAD_array(i,j,k,2))/2.0*dt_advance;
         });
+
     }
 }
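// Editor's note (illustrative): the source terms in mf_vars_generalAD live at
// cell centers while u, v, w live on x-, y-, z-faces, so each face update in
// the kernels above averages the two adjacent cell-centered values, e.g. for
// the x-face at (i,j,k):
//
//   u(i,j,k) += 0.5 * (S_x(i-1,j,k) + S_x(i,j,k)) * dt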
+void
+GeneralAD::compute_freestream_velocity (const MultiFab& cons_in,
+                                        const MultiFab& U_old,
+                                        const MultiFab& V_old,
+                                        const MultiFab& mf_SMark)
+{
+    get_turb_loc(xloc, yloc);
+    freestream_velocity.clear();
+    freestream_phi.clear();
+    disk_cell_count.clear();
+    freestream_velocity.resize(xloc.size(),0.0);
+    freestream_phi.resize(xloc.size(),0.0);
+    disk_cell_count.resize(xloc.size(),0.0);
+
+    Gpu::DeviceVector<Real> d_freestream_velocity(xloc.size());
+    Gpu::DeviceVector<Real> d_freestream_phi(yloc.size());
+    Gpu::DeviceVector<Real> d_disk_cell_count(yloc.size());
+    Gpu::copy(Gpu::hostToDevice, freestream_velocity.begin(), freestream_velocity.end(), d_freestream_velocity.begin());
+    Gpu::copy(Gpu::hostToDevice, freestream_phi.begin(), freestream_phi.end(), d_freestream_phi.begin());
+    Gpu::copy(Gpu::hostToDevice, disk_cell_count.begin(), disk_cell_count.end(), d_disk_cell_count.begin());
+
+    Real* d_freestream_velocity_ptr = d_freestream_velocity.data();
+    Real* d_freestream_phi_ptr = d_freestream_phi.data();
+    Real* d_disk_cell_count_ptr = d_disk_cell_count.data();
+
+
+    for ( MFIter mfi(cons_in,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
+
+        auto SMark_array = mf_SMark.array(mfi);
+        auto u_vel = U_old.array(mfi);
+        auto v_vel = V_old.array(mfi);
+        Box tbx = mfi.nodaltilebox(0);
+
+        ParallelFor(tbx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+
+            if(SMark_array(i,j,k,0) != -1.0) {
+                int turb_index = static_cast<int>(SMark_array(i,j,k,0));
+                Real phi = std::atan2(v_vel(i,j,k),u_vel(i,j,k)); // Wind direction w.r.t. the x-direction
+                Gpu::Atomic::Add(&d_freestream_velocity_ptr[turb_index],std::pow(u_vel(i,j,k)*u_vel(i,j,k) + v_vel(i,j,k)*v_vel(i,j,k),0.5));
+                Gpu::Atomic::Add(&d_disk_cell_count_ptr[turb_index],1.0);
+                Gpu::Atomic::Add(&d_freestream_phi_ptr[turb_index],phi);
+            }
+        });
+    }
+
+    // Copy back to the host
+    Gpu::copy(Gpu::deviceToHost, d_freestream_velocity.begin(), d_freestream_velocity.end(), freestream_velocity.begin());
+    Gpu::copy(Gpu::deviceToHost, d_freestream_phi.begin(), d_freestream_phi.end(), freestream_phi.begin());
+    Gpu::copy(Gpu::deviceToHost, d_disk_cell_count.begin(), d_disk_cell_count.end(), disk_cell_count.begin());
+
+    // Reduce the data on every processor
+    amrex::ParallelAllReduce::Sum(freestream_velocity.data(),
+                                  freestream_velocity.size(),
+                                  amrex::ParallelContext::CommunicatorAll());
+
+    amrex::ParallelAllReduce::Sum(freestream_phi.data(),
+                                  freestream_phi.size(),
+                                  amrex::ParallelContext::CommunicatorAll());
+
+
+    amrex::ParallelAllReduce::Sum(disk_cell_count.data(),
+                                  disk_cell_count.size(),
+                                  amrex::ParallelContext::CommunicatorAll());
+
+    get_turb_loc(xloc, yloc);
+
+
+    if (ParallelDescriptor::IOProcessor()){
+        for(int it=0; it<static_cast<int>(xloc.size()); it++){
+            printf("Average freestream velocity for turbine %d is %0.15g\n",
+                   it, freestream_velocity[it]/(disk_cell_count[it]+1e-10));
+        }
+    }
+}
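// Editor's aside (illustrative): std::atan2(v, u) used above returns the wind
// direction in radians, measured counterclockwise from the +x axis, in (-pi, pi]:
#include <cmath>
#include <cstdio>
int main () {
    const double rad2deg = 180.0 / 3.141592653589793;
    std::printf("%7.1f\n", std::atan2( 0.0,  1.0) * rad2deg); //    0.0 deg: flow along +x
    std::printf("%7.1f\n", std::atan2( 1.0,  0.0) * rad2deg); //   90.0 deg: flow along +y
    std::printf("%7.1f\n", std::atan2(-1.0, -1.0) * rad2deg); // -135.0 deg
    return 0;
}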
+AMREX_FORCE_INLINE
+AMREX_GPU_DEVICE
+int
+find_rad_loc_index (const Real rad,
+                    const Real* bld_rad_loc,
+                    const int n_bld_sections)
+{
+    // Find the index of the first radial section past rad
+    int index = -1;
+    for(int i=0;i<n_bld_sections;i++) {
+        if(bld_rad_loc[i] > rad) {
+            index = i;
+            break;
+        }
+    }
+    if(index == -1 and rad > bld_rad_loc[n_bld_sections-1]) {
+        index = n_bld_sections-1;
+    }
+    if(index == -1) {
+        //printf("The radial section is at %0.15g m\n",rad);
+        Abort("Could not find index of the radial section.");
+    }
+
+    return index;
+}
+
+AMREX_FORCE_INLINE
+AMREX_GPU_DEVICE
+std::array<Real,2>
+compute_source_terms_Fn_Ft (const Real rad,
+                            const Real avg_vel,
+                            const Real* bld_rad_loc,
+                            const Real* bld_twist,
+                            const Real* bld_chord,
+                            int n_bld_sections,
+                            const Real* bld_airfoil_aoa,
+                            const Real* bld_airfoil_Cl,
+                            const Real* bld_airfoil_Cd,
+                            const int n_pts_airfoil,
+                            const Real* velocity,
+                            const Real* rotor_RPM,
+                            const Real* blade_pitch,
+                            const int n_spec_extra)
+{
+
+    Real rpm   = interpolate_1d(velocity, rotor_RPM, avg_vel, n_spec_extra);
+    Real pitch = interpolate_1d(velocity, blade_pitch, avg_vel, n_spec_extra);
+
+    Real Omega = rpm/60.0*2.0*PI;
+    Real rho = 1.226;
+
+    Real B = 3.0;
+    Real rhub = 2.0;
+    Real rtip = 63.5;
+
+    Real twist = interpolate_1d(bld_rad_loc, bld_twist, rad, n_bld_sections);
+    Real c     = interpolate_1d(bld_rad_loc, bld_chord, rad, n_bld_sections);
+
+    // Iteration procedure
+
+    Real s = 0.5*c*B/(PI*rad);
+
+    Real at, an, V1, Vt, Vr, psi, L, D, Cn, Ct;
+    Real ftip, fhub, F, Cl, Cd, at_new, an_new;
+
+    at = 0.1;
+    an = 0.1;
+
+    bool is_converged = false;
+
+    for(int i=0;i<100;i++) {
+        V1 = avg_vel*(1-an);
+        Vt = Omega*(1.0+at)*rad;
+        Vr = std::pow(V1*V1+Vt*Vt,0.5);
+
+        psi = std::atan2(V1,Vt);
+
+        Real aoa = psi*180.0/PI - twist + pitch;
+
+        Cl = interpolate_1d(bld_airfoil_aoa, bld_airfoil_Cl, aoa, n_pts_airfoil);
+        Cd = interpolate_1d(bld_airfoil_aoa, bld_airfoil_Cd, aoa, n_pts_airfoil);
+
+        //Cl = 1.37;
+        //Cd = 0.014;
+
+        //printf("rad, aoa, Cl, Cd = %0.15g %0.15g %0.15g %0.15g\n", rad, aoa, Cl, Cd);
+
+        Cn = Cl*std::cos(psi) + Cd*std::sin(psi);
+        Ct = Cl*std::sin(psi) - Cd*std::cos(psi);
+
+        ftip = B*(rtip-rad)/(2.0*rad*std::sin(psi)+1e-10);
+        fhub = B*(rad-rhub)/(2.0*rad*std::sin(psi)+1e-10);
+
+        AMREX_ALWAYS_ASSERT(std::fabs(std::exp(-fhub))<=1.0);
+        AMREX_ALWAYS_ASSERT(std::fabs(std::exp(-ftip))<=1.0);
+
+        F = 1.0; // Prandtl tip/hub losses currently disabled: 2.0/PI*(std::acos(std::exp(-ftip)) + std::acos(std::exp(-fhub)) );
+
+        at_new = 1.0/ ( 4.0*F*std::sin(psi)*std::cos(psi)/(s*Ct+1e-10) - 1.0 );
+        an_new = 1.0/ ( 1.0 + 4.0*F*std::pow(std::sin(psi),2)/(s*Cn + 1e-10) );
+        at_new = std::max(0.0, at_new);
+
+        if(std::fabs(at_new-at) < 1e-5 and std::fabs(an_new-an) < 1e-5) {
+            //printf("Converged at, an = %d %0.15g %0.15g %0.15g\n",i, at, an, psi);
+            at = at_new;
+            an = an_new;
+            is_converged = true;
+            break;
+        }
+        at = at_new;
+        an = an_new;
+        //printf("Iteration, at, an = %0.15g %0.15g %0.15g\n",at, an, psi);
+    }
+
+    if(!is_converged) {
+        Abort("The iteration procedure for the generalized actuator disk did not converge. Exiting...");
+    }
+
+    // Iterations converged. Now compute Fn, Ft
+
+    L = 0.5*rho*Vr*Vr*c*Cl;
+    D = 0.5*rho*Vr*Vr*c*Cd;
+
+    Real Fn = L*std::cos(psi) + D*std::sin(psi);
+    Real Ft = L*std::sin(psi) - D*std::cos(psi);
+
+    //printf("Fn and Ft %0.15g %0.15g %0.15g %0.15g\n", L, D, std::cos(psi), std::sin(psi));
+
+    std::array<Real,2> Fn_and_Ft;
+    Fn_and_Ft[0] = Fn;
+    Fn_and_Ft[1] = Ft;
+
+    return Fn_and_Ft;
+}
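// Editor's sketch of what interpolate_1d is assumed to do in the iteration
// above (the signature is inferred from the call sites; the real helper lives
// elsewhere in ERF): piecewise-linear interpolation on a sorted table, clamped
// at both ends. The name carries a _sketch suffix to mark it as illustrative.
AMREX_FORCE_INLINE
AMREX_GPU_DEVICE
Real interpolate_1d_sketch (const Real* x, const Real* y,
                            const Real xq, const int n)
{
    if (xq <= x[0])   { return y[0];   } // clamp below the table
    if (xq >= x[n-1]) { return y[n-1]; } // clamp above the table
    for (int i = 1; i < n; ++i) {
        if (xq <= x[i]) {
            const Real t = (xq - x[i-1]) / (x[i] - x[i-1]);
            return (1.0 - t) * y[i-1] + t * y[i];
        }
    }
    return y[n-1]; // unreachable for a sorted table
}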
 void
 GeneralAD::source_terms_cellcentered (const Geometry& geom,
                                       const MultiFab& cons_in,
@@ -58,9 +289,19 @@ GeneralAD::source_terms_cellcentered (const Geometry& geom,
 {

 get_turb_loc(xloc, yloc);
+    get_turb_spec(rotor_rad, hub_height, thrust_coeff_standing, wind_speed, thrust_coeff, power);
+
+    get_blade_spec(bld_rad_loc,bld_twist,bld_chord);
+
+    get_blade_airfoil_spec(bld_airfoil_aoa, bld_airfoil_Cl, bld_airfoil_Cd);
+
+    get_turb_spec_extra(velocity, C_P, C_T, rotor_RPM, blade_pitch);
+
+    Real d_hub_height = hub_height;
+    Real d_rotor_rad  = rotor_rad;
+
 Gpu::DeviceVector<Real> d_xloc(xloc.size());
 Gpu::DeviceVector<Real> d_yloc(yloc.size());
 Gpu::copy(Gpu::hostToDevice, xloc.begin(), xloc.end(), d_xloc.begin());
@@ -70,6 +311,7 @@ GeneralAD::source_terms_cellcentered (const Geometry& geom,

 // Domain valid box
 const amrex::Box& domain = geom.Domain();
+    auto ProbLoArr = geom.ProbLoArray();
 int domlo_x = domain.smallEnd(0);
 int domhi_x = domain.bigEnd(0) + 1;
 int domlo_y = domain.smallEnd(1);
@@ -80,23 +322,85 @@ GeneralAD::source_terms_cellcentered (const Geometry& geom,

 // The order of the variables is - Vabs, dVabsdt, dudt, dvdt, dTKEdt
 mf_vars_generalAD.setVal(0.0);

-    long unsigned int nturbs = xloc.size();
+    long unsigned int nturbs = xloc.size();

+    // This is the angle phi in Fig. 10 in Mirocha et al. 2014.
+    // set_turb_disk_angle in ERF_InitWindFarm.cpp sets this phi as
+    // the turb_disk_angle
 get_turb_disk_angle(turb_disk_angle);
-    Real nx = -std::cos(turb_disk_angle);
-    Real ny = -std::sin(turb_disk_angle);
 Real d_turb_disk_angle = turb_disk_angle;

-    Gpu::DeviceVector<Real> d_wind_speed(wind_speed.size());
-    Gpu::DeviceVector<Real> d_thrust_coeff(thrust_coeff.size());
+    Gpu::DeviceVector<Real> d_freestream_velocity(nturbs);
+    Gpu::DeviceVector<Real> d_disk_cell_count(nturbs);
+    Gpu::copy(Gpu::hostToDevice, freestream_velocity.begin(), freestream_velocity.end(), d_freestream_velocity.begin());
+    Gpu::copy(Gpu::hostToDevice, disk_cell_count.begin(), disk_cell_count.end(), d_disk_cell_count.begin());
+
+    Real* d_xloc_ptr = d_xloc.data();
+    Real* d_yloc_ptr = d_yloc.data();
+    Real* d_freestream_velocity_ptr = d_freestream_velocity.data();
+    Real* d_disk_cell_count_ptr = d_disk_cell_count.data();
+
+    int n_bld_sections = bld_rad_loc.size();
+
+    Gpu::DeviceVector<Real> d_bld_rad_loc(n_bld_sections);
+    Gpu::DeviceVector<Real> d_bld_twist(n_bld_sections);
+    Gpu::DeviceVector<Real> d_bld_chord(n_bld_sections);
+
+    Gpu::copy(Gpu::hostToDevice, bld_rad_loc.begin(), bld_rad_loc.end(), d_bld_rad_loc.begin());
+    Gpu::copy(Gpu::hostToDevice, bld_twist.begin(), bld_twist.end(), d_bld_twist.begin());
+    Gpu::copy(Gpu::hostToDevice, bld_chord.begin(), bld_chord.end(), d_bld_chord.begin());

-    // Copy data from host vectors to device vectors
-    Gpu::copy(Gpu::hostToDevice, wind_speed.begin(), wind_speed.end(), d_wind_speed.begin());
-    Gpu::copy(Gpu::hostToDevice, thrust_coeff.begin(), thrust_coeff.end(), d_thrust_coeff.begin());
+    Real* bld_rad_loc_ptr = d_bld_rad_loc.data();
+    Real* bld_twist_ptr   = d_bld_twist.data();
+    Real* bld_chord_ptr   = d_bld_chord.data();

-    const Real* wind_speed_d = d_wind_speed.dataPtr();
-    const Real* thrust_coeff_d = d_thrust_coeff.dataPtr();
-    const int n_spec_table = d_wind_speed.size();
+    Vector<Gpu::DeviceVector<Real>> d_bld_airfoil_aoa(n_bld_sections);
+    Vector<Gpu::DeviceVector<Real>> d_bld_airfoil_Cl(n_bld_sections);
+    Vector<Gpu::DeviceVector<Real>> d_bld_airfoil_Cd(n_bld_sections);
+
+    int n_pts_airfoil = bld_airfoil_aoa[0].size();
+
+    for(int i=0;i<n_bld_sections;i++) {
+        d_bld_airfoil_aoa[i].resize(n_pts_airfoil);
+        d_bld_airfoil_Cl[i].resize(n_pts_airfoil);
+        d_bld_airfoil_Cd[i].resize(n_pts_airfoil);
+        Gpu::copy(Gpu::hostToDevice, bld_airfoil_aoa[i].begin(), bld_airfoil_aoa[i].end(), d_bld_airfoil_aoa[i].begin());
+        Gpu::copy(Gpu::hostToDevice, bld_airfoil_Cl[i].begin(), bld_airfoil_Cl[i].end(), d_bld_airfoil_Cl[i].begin());
+        Gpu::copy(Gpu::hostToDevice, bld_airfoil_Cd[i].begin(), bld_airfoil_Cd[i].end(), d_bld_airfoil_Cd[i].begin());
+    }
+
+    Vector<Real*> hp_bld_airfoil_aoa, hp_bld_airfoil_Cl, hp_bld_airfoil_Cd;
+    for (auto & v : d_bld_airfoil_aoa) {
+        hp_bld_airfoil_aoa.push_back(v.data());
+    }
+    for (auto & v : d_bld_airfoil_Cl) {
+        hp_bld_airfoil_Cl.push_back(v.data());
+    }
+    for (auto & v : d_bld_airfoil_Cd) {
+        hp_bld_airfoil_Cd.push_back(v.data());
+    }
+
+    Gpu::AsyncArray<Real*> aoa(hp_bld_airfoil_aoa.data(), n_bld_sections);
+    Gpu::AsyncArray<Real*> Cl(hp_bld_airfoil_Cl.data(), n_bld_sections);
+    Gpu::AsyncArray<Real*> Cd(hp_bld_airfoil_Cd.data(), n_bld_sections);
+
+    auto d_bld_airfoil_aoa_ptr = aoa.data();
+    auto d_bld_airfoil_Cl_ptr  = Cl.data();
+    auto d_bld_airfoil_Cd_ptr  = Cd.data();
+
+    int n_spec_extra = velocity.size();
+
+    Gpu::DeviceVector<Real> d_velocity(n_spec_extra);
+    Gpu::DeviceVector<Real> d_rotor_RPM(n_spec_extra);
+    Gpu::DeviceVector<Real> d_blade_pitch(n_spec_extra);
+
+    Gpu::copy(Gpu::hostToDevice, velocity.begin(), velocity.end(), d_velocity.begin());
+    Gpu::copy(Gpu::hostToDevice, rotor_RPM.begin(), rotor_RPM.end(), d_rotor_RPM.begin());
+    Gpu::copy(Gpu::hostToDevice, blade_pitch.begin(), blade_pitch.end(), d_blade_pitch.begin());
+
+    auto d_velocity_ptr    = d_velocity.data();
+    auto d_rotor_RPM_ptr   = d_rotor_RPM.data();
+    auto d_blade_pitch_ptr = d_blade_pitch.data();
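/* Editor's aside (illustrative sketch of the pointer-table pattern above;
   sizes and names are hypothetical, and the fragment is kept inside this
   comment so the surrounding function is unchanged). Each per-section table
   is copied into its own device vector, the host gathers the device pointers,
   and Gpu::AsyncArray ships that pointer list to the device, so a kernel can
   index d_tbl_ptr[section][point]:

   int nsec = 3, npts = 100;
   Vector<Gpu::DeviceVector<Real>> d_tbl(nsec);  // one device vector per section
   Vector<Real*> hp;                             // host-side list of device pointers
   for (auto& v : d_tbl) {
       v.resize(npts);
       hp.push_back(v.data());
   }
   Gpu::AsyncArray<Real*> tbl(hp.data(), nsec);  // device-visible pointer list
   Real* const* d_tbl_ptr = tbl.data();          // capture by value in ParallelFor
*/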
 for ( MFIter mfi(cons_in,TilingIfNotGPU()); mfi.isValid(); ++mfi) {
@@ -109,34 +413,79 @@ GeneralAD::source_terms_cellcentered (const Geometry& geom,
 int ii = amrex::min(amrex::max(i, domlo_x), domhi_x);
 int jj = amrex::min(amrex::max(j, domlo_y), domhi_y);
 int kk = amrex::min(amrex::max(k, domlo_z), domhi_z);

+            Real x = ProbLoArr[0] + (ii+0.5)*dx[0];
+            Real y = ProbLoArr[1] + (jj+0.5)*dx[1];
+            Real z = ProbLoArr[2] + (kk+0.5)*dx[2];
+            // ?? The density is needed here (currently hardcoded to 1.0)
+            Real inv_dens_vol = 1.0/(1.0*dx[0]*dx[1]*dx[2]);
+
+            int check_int = 0;

-            Real source_x = 0.0;
-            Real source_y = 0.0;
+            Real source_x = 0.0, source_y = 0.0, source_z = 0.0;
+            std::array<Real,2> Fn_and_Ft;

             for(long unsigned int it=0;it<nturbs;it++) {
+                Real avg_vel = d_freestream_velocity_ptr[it]/(d_disk_cell_count_ptr[it] + 1e-10);
+                Real phi = d_turb_disk_angle;
                 if(SMark_array(ii,jj,kk,1) == static_cast<Real>(it)) {
                     check_int++;
-                if(C_T <= 1) {
-                    source_x = -2.0*std::pow(Uinfty_dot_nhat, 2.0)*a*(1.0-a)*dx[1]*dx[2]*std::cos(d_turb_disk_angle)/(dx[0]*dx[1]*dx[2])*std::cos(phi);
-                    source_y = -2.0*std::pow(Uinfty_dot_nhat, 2.0)*a*(1.0-a)*dx[1]*dx[2]*std::cos(d_turb_disk_angle)/(dx[0]*dx[1]*dx[2])*std::sin(phi);
-                }
-                else {
-                    source_x = -0.5*C_T*std::pow(Uinfty_dot_nhat, 2.0)*dx[1]*dx[2]*std::cos(d_turb_disk_angle)/(dx[0]*dx[1]*dx[2])*std::cos(phi);
-                    source_y = -0.5*C_T*std::pow(Uinfty_dot_nhat, 2.0)*dx[1]*dx[2]*std::cos(d_turb_disk_angle)/(dx[0]*dx[1]*dx[2])*std::sin(phi);
+
+                    // Find the radial distance of the point and the zeta angle
+                    Real rad = std::pow( (x-d_xloc_ptr[it])*(x-d_xloc_ptr[it]) +
+                                         (y-d_yloc_ptr[it])*(y-d_yloc_ptr[it]) +
+                                         (z-d_hub_height)*(z-d_hub_height), 0.5 );
+
+                    int index = find_rad_loc_index(rad, bld_rad_loc_ptr, n_bld_sections);
+
+                    // This check makes sure the point is at a radial distance
+                    // between the hub radius and the rotor radius.
+                    // ?? The hub radius is needed here (currently hardcoded to 2.0)
+                    if(rad >= 2.0 and rad <= d_rotor_rad) {
+                        //AMREX_ASSERT( (z-d_hub_height) <= rad );
+                        // Consider the vector that joins the point and the turbine center.
+                        // Dot it onto the vector that starts at the turbine center and lies
+                        // in the plane of the disk. See Fig. 10 in Mirocha et al. 2014.

+                        Real vec_proj = (x-d_xloc_ptr[it])*(std::sin(phi)) +
+                                        (y-d_yloc_ptr[it])*(-std::cos(phi));
+
+
+                        Real zeta = std::atan2(z-d_hub_height, vec_proj);
+                        //printf("zeta val is %0.15g\n", zeta*180.0/PI);
+                        Fn_and_Ft = compute_source_terms_Fn_Ft(rad, avg_vel,
+                                                               bld_rad_loc_ptr,
+                                                               bld_twist_ptr,
+                                                               bld_chord_ptr,
+                                                               n_bld_sections,
+                                                               d_bld_airfoil_aoa_ptr[index],
+                                                               d_bld_airfoil_Cl_ptr[index],
+                                                               d_bld_airfoil_Cd_ptr[index],
+                                                               n_pts_airfoil,
+                                                               d_velocity_ptr,
+                                                               d_rotor_RPM_ptr,
+                                                               d_blade_pitch_ptr,
+                                                               n_spec_extra);
+
+                        Real Fn = Fn_and_Ft[0];
+                        Real Ft = Fn_and_Ft[1];
+
+                        // Project the normal and tangential forces onto x, y, z
+                        Real Fx = Fn*std::cos(phi) + Ft*std::sin(zeta)*std::sin(phi);
+                        Real Fy = Fn*std::sin(phi) - Ft*std::sin(zeta)*std::cos(phi);
+                        Real Fz = -Ft*std::cos(zeta);
+
+                        source_x = -Fx*inv_dens_vol;
+                        source_y = -Fy*inv_dens_vol;
+                        source_z = -Fz*inv_dens_vol;
+
+
+                        //printf("Val source_x, is %0.15g, %0.15g, %0.15g %0.15g %0.15g %0.15g\n", rad, Fn, Ft, source_x, source_y, source_z);
                     }
                 }
             }
+
            if(check_int > 1){
                amrex::Error("Actuator disks are overlapping. Visualize actuator_disks.vtk "
                             "and check the wind turbine locations input file.
                             "Exiting...");
@@ -144,6 +493,7 @@ GeneralAD::source_terms_cellcentered (const Geometry& geom,

             generalAD_array(i,j,k,0) = source_x;
             generalAD_array(i,j,k,1) = source_y;
+            generalAD_array(i,j,k,2) = source_z;
         });
     }
 }
diff --git a/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_GeneralAD.H b/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_GeneralAD.H
index f3ab40227..593b16a48 100644
--- a/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_GeneralAD.H
+++ b/Source/WindFarmParametrization/GeneralActuatorDisk/ERF_GeneralAD.H
@@ -23,6 +23,11 @@ public:
                  const amrex::MultiFab& mf_Nturb,
                  const amrex::MultiFab& mf_SMark) override;

+    void compute_freestream_velocity (const amrex::MultiFab& cons_in,
+                                      const amrex::MultiFab& U_old,
+                                      const amrex::MultiFab& V_old,
+                                      const amrex::MultiFab& mf_SMark);
+
     void source_terms_cellcentered (const amrex::Geometry& geom,
                                     const amrex::MultiFab& cons_in,
                                     const amrex::MultiFab& mf_Smark,
@@ -32,6 +37,7 @@ public:
                  amrex::MultiFab& cons_in,
                  amrex::MultiFab& U_old,
                  amrex::MultiFab& V_old,
+                 amrex::MultiFab& W_old,
                  const amrex::MultiFab& mf_vars);

 protected:
@@ -40,6 +46,9 @@ protected:
     amrex::Real hub_height, rotor_rad, thrust_coeff_standing, nominal_power;
     amrex::Vector<amrex::Real> wind_speed, thrust_coeff, power;
     amrex::Vector<amrex::Real> freestream_velocity, freestream_phi, disk_cell_count;
+    amrex::Vector<amrex::Real> bld_rad_loc, bld_twist, bld_chord;
+    amrex::Vector<amrex::Vector<amrex::Real>> bld_airfoil_aoa, bld_airfoil_Cl, bld_airfoil_Cd;
+    amrex::Vector<amrex::Real> velocity, C_P, C_T, rotor_RPM, blade_pitch;
 };

 #endif
diff --git a/Source/WindFarmParametrization/Null/ERF_NullWindFarm.H b/Source/WindFarmParametrization/Null/ERF_NullWindFarm.H
index 934832b19..2daea0aa5 100644
--- a/Source/WindFarmParametrization/Null/ERF_NullWindFarm.H
+++ b/Source/WindFarmParametrization/Null/ERF_NullWindFarm.H
@@ -49,6 +49,37 @@ public:
 m_turb_disk_angle = turb_disk_angle;
 }

+    virtual void set_blade_spec(const amrex::Vector<amrex::Real>& bld_rad_loc,
+                                const amrex::Vector<amrex::Real>& bld_twist,
+                                const amrex::Vector<amrex::Real>& bld_chord)
+    {
+        m_bld_rad_loc = bld_rad_loc;
+        m_bld_twist = bld_twist;
+        m_bld_chord = bld_chord;
+    }
+
+    virtual void set_blade_airfoil_spec(const amrex::Vector<amrex::Vector<amrex::Real>>& bld_airfoil_aoa,
+                                        const amrex::Vector<amrex::Vector<amrex::Real>>& bld_airfoil_Cl,
+                                        const amrex::Vector<amrex::Vector<amrex::Real>>& bld_airfoil_Cd)
+    {
+        m_bld_airfoil_aoa = bld_airfoil_aoa;
+        m_bld_airfoil_Cl = bld_airfoil_Cl;
+        m_bld_airfoil_Cd = bld_airfoil_Cd;
+    }
+
+    virtual void set_turb_spec_extra(const amrex::Vector<amrex::Real>& velocity,
+                                     const amrex::Vector<amrex::Real>& C_P,
+                                     const amrex::Vector<amrex::Real>& C_T,
+                                     const amrex::Vector<amrex::Real>& rotor_RPM,
+                                     const amrex::Vector<amrex::Real>& blade_pitch)
+    {
+        m_velocity = velocity;
+        m_C_P = C_P;
+        m_C_T = C_T;
+        m_rotor_RPM = rotor_RPM;
+        m_blade_pitch = blade_pitch;
+    }
+
 void get_turb_spec (amrex::Real& rotor_rad, amrex::Real& hub_height,
                     amrex::Real& thrust_coeff_standing, amrex::Vector<amrex::Real>& wind_speed,
                     amrex::Vector<amrex::Real>& thrust_coeff, amrex::Vector<amrex::Real>& power)
@@ -73,6 +104,36 @@ public:
 turb_disk_angle = m_turb_disk_angle;
 }

+    void get_blade_spec(amrex::Vector<amrex::Real>& bld_rad_loc,
+                        amrex::Vector<amrex::Real>& bld_twist,
+                        amrex::Vector<amrex::Real>& bld_chord)
+    {
+        bld_rad_loc = m_bld_rad_loc;
+        bld_twist = m_bld_twist;
+        bld_chord = m_bld_chord;
+    }
+
+    void get_blade_airfoil_spec(amrex::Vector<amrex::Vector<amrex::Real>>& bld_airfoil_aoa,
+                                amrex::Vector<amrex::Vector<amrex::Real>>& bld_airfoil_Cl,
+                                amrex::Vector<amrex::Vector<amrex::Real>>& bld_airfoil_Cd)
+    {
+        bld_airfoil_aoa = m_bld_airfoil_aoa;
+        bld_airfoil_Cl = m_bld_airfoil_Cl;
+        bld_airfoil_Cd = m_bld_airfoil_Cd;
+    }
+
+    void get_turb_spec_extra(amrex::Vector<amrex::Real>& velocity,
+                             amrex::Vector<amrex::Real>& C_P,
+                             amrex::Vector<amrex::Real>& C_T,
+                             amrex::Vector<amrex::Real>& rotor_RPM,
+                             amrex::Vector<amrex::Real>& blade_pitch)
+    {
+        velocity = m_velocity;
+        C_P = m_C_P;
+        C_T = m_C_T;
+        rotor_RPM = m_rotor_RPM;
+        blade_pitch = m_blade_pitch;
+    }

 static AMREX_GPU_DEVICE
 bool find_if_marked(amrex::Real x1, amrex::Real x2, amrex::Real y1, amrex::Real y2,
@@ -121,6 +182,9 @@ protected:
     amrex::Real m_turb_disk_angle;
     amrex::Real m_hub_height, m_rotor_rad, m_thrust_coeff_standing, m_nominal_power;
     amrex::Vector<amrex::Real> m_wind_speed, m_thrust_coeff, m_power;
+    amrex::Vector<amrex::Real> m_bld_rad_loc, m_bld_twist, m_bld_chord;
+    amrex::Vector<amrex::Vector<amrex::Real>> m_bld_airfoil_aoa, m_bld_airfoil_Cl, m_bld_airfoil_Cd;
+    amrex::Vector<amrex::Real> m_velocity, m_C_P, m_C_T, m_rotor_RPM, m_blade_pitch;
 };
diff --git a/Source/main.cpp b/Source/main.cpp
index aaf932679..4cd2ee0b3 100644
--- a/Source/main.cpp
+++ b/Source/main.cpp
@@ -7,14 +7,6 @@
 //#include "IO.H"
 #include "ERF.H"

-#ifdef ERF_USE_MULTIBLOCK
-#ifndef ERF_MB_EXTERN // enter only if multiblock does not involve an external class
-#include
-#else
-#include
-#endif
-#endif
-
 #ifdef ERF_USE_WW3_COUPLING
 #include
 #include
@@ -124,101 +116,6 @@ int main (int argc, char* argv[])

 // wallclock time
 const Real strt_total = amrex::second();

-#ifdef ERF_USE_MULTIBLOCK
-    {
-        // Vector of constructor parameters for MultiBlock
-        std::vector<RealBox> rb_v;
-        std::vector<int> max_level_v;
-        std::vector<int> coord_v;
-        std::vector<amrex::Vector<int>> n_cell_v;
-        std::vector<amrex::Array<int,AMREX_SPACEDIM>> is_per_v;
-        std::vector<amrex::Vector<amrex::IntVect>> ref_rat_v;
-        std::vector<std::string> prefix_v;
-        int max_step{1};
-
-        // Local constructor parameters for the vectors
-        RealBox rb;
-        int max_level{0};
-        int coord{0};
-        amrex::Vector<int> n_cell = {1,1,1};
-        amrex::Array<int,AMREX_SPACEDIM> is_per = {1,1,1};
-        amrex::Vector<amrex::IntVect> ref_rat = {amrex::IntVect(1,1,1)};
-
-        // Parse max steps for the block
-        {
-            ParmParse pp;
-            pp.query("max_step", max_step);
-        }
-
-        // Parse data for erf1 constructor
-        {
-            ParmParse pp("erf1");
-            amrex::Vector<Real> lo = {0.,0.,0.};
-            amrex::Vector<Real> hi = {0.,0.,0.};
-            amrex::Vector<int> periodicity = {1,1,1};
-            pp.queryarr("prob_lo",lo);
-            pp.queryarr("prob_hi",hi);
-            rb.setLo(lo);
-            rb.setHi(hi);
-            pp.query("max_level",max_level);
-            pp.query("coord",coord);
-            pp.queryarr("n_cell",n_cell);
-            pp.queryarr("is_periodic",periodicity);
-            {
-                for( int i(0); i<AMREX_SPACEDIM; ++i ) { is_per[i] = periodicity[i]; }
-            }
-        }
-
-        // Parse data for erf2 constructor
-        {
-            ParmParse pp("erf2");
-            amrex::Vector<Real> lo = {0.,0.,0.};
-            amrex::Vector<Real> hi = {0.,0.,0.};
-            amrex::Vector<int> periodicity = {1,1,1};
-            pp.queryarr("prob_lo",lo);
-            pp.queryarr("prob_hi",hi);
-            rb.setLo(lo);
-            rb.setHi(hi);
-            pp.query("max_level",max_level);
-            pp.query("coord",coord);
-            pp.queryarr("n_cell",n_cell);
-            pp.queryarr("is_periodic",periodicity);
-            {
-                for( int i(0); i<AMREX_SPACEDIM; ++i ) { is_per[i] = periodicity[i]; }
-            }
-        }
diff --git a/Tests/CTestList.cmake b/Tests/CTestList.cmake
@@ ... @@ function(add_test_r TEST_NAME TEST_EXE PLTFILE)
+    set(test_command sh -c "${MPI_COMMANDS} ${TEST_EXE} ${CURRENT_TEST_BINARY_DIR}/${TEST_NAME}.i > ${TEST_NAME}.log && ${MPI_FCOMP_COMMANDS} ${FCOMPARE_EXE} ${FCOMPARE_FLAGS} ${PLOT_GOLD} ${CURRENT_TEST_BINARY_DIR}/${PLTFILE}")
@@ -85,7 +85,7 @@ function(add_test_0 TEST_NAME TEST_EXE PLTFILE)
     set(TEST_EXE ${CMAKE_BINARY_DIR}/Exec/${TEST_EXE})
     set(FCOMPARE_TOLERANCE "-r 1e-14 --abs_tol 1.0e-14")
     set(FCOMPARE_FLAGS "-a ${FCOMPARE_TOLERANCE}")
-    set(test_command sh -c "${MPI_COMMANDS} ${TEST_EXE} ${CURRENT_TEST_BINARY_DIR}/${TEST_NAME}.i erf.input_sounding_file=${CURRENT_TEST_BINARY_DIR}/input_sounding > ${TEST_NAME}.log && ${FCOMPARE_EXE} ${FCOMPARE_FLAGS} ${CURRENT_TEST_BINARY_DIR}/plt00000 ${CURRENT_TEST_BINARY_DIR}/${PLTFILE}")
+    set(test_command sh -c "${MPI_COMMANDS} ${TEST_EXE} ${CURRENT_TEST_BINARY_DIR}/${TEST_NAME}.i erf.input_sounding_file=${CURRENT_TEST_BINARY_DIR}/input_sounding > ${TEST_NAME}.log && ${MPI_FCOMP_COMMANDS} ${FCOMPARE_EXE} ${FCOMPARE_FLAGS} ${CURRENT_TEST_BINARY_DIR}/plt00000 ${CURRENT_TEST_BINARY_DIR}/${PLTFILE}")
     add_test(${TEST_NAME} ${test_command})
     set_tests_properties(${TEST_NAME}