From 9f7e9ebd8330af9feefdc214e316186695bc5bd2 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 17 Dec 2024 10:57:34 -0800 Subject: [PATCH] GMRESMLMG: Support Multi-level composite solve --- Src/Base/AMReX_FabArrayUtility.H | 203 +++++++++++++++++ Src/Base/AMReX_Vector.H | 18 ++ Src/LinearSolvers/AMReX_GMRES_MLMG.H | 214 ++++++++++-------- Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 22 +- Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H | 76 +++++++ .../MLMG/AMReX_MLEBABecLap_F.cpp | 2 +- .../MLMG/AMReX_MLEBNodeFDLaplacian.H | 4 +- .../MLMG/AMReX_MLEBNodeFDLaplacian.cpp | 50 ++-- Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp | 2 +- Src/LinearSolvers/MLMG/AMReX_MLLinOp.H | 34 ++- Src/LinearSolvers/MLMG/AMReX_MLMG.H | 186 ++++++++------- .../MLMG/AMReX_MLNodeLaplacian_sync.cpp | 4 +- Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H | 8 +- Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp | 95 ++++++-- .../MLMG/AMReX_MLNodeLinOp_1D_K.H | 35 +++ .../MLMG/AMReX_MLNodeLinOp_2D_K.H | 57 +++++ .../MLMG/AMReX_MLNodeLinOp_3D_K.H | 82 +++++++ .../LinearSolvers/ABecLaplacian_C/MyTest.cpp | 110 ++++++--- Tests/LinearSolvers/CellEB/GNUmakefile | 2 +- Tests/LinearSolvers/CellEB/MyTest.H | 3 + Tests/LinearSolvers/CellEB/MyTest.cpp | 41 +++- Tests/LinearSolvers/CurlCurl/MyTest.cpp | 5 - Tests/LinearSolvers/NodalPoisson/MyTest.cpp | 11 +- 23 files changed, 990 insertions(+), 274 deletions(-) diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H index bfac76c471e..6df3edd827a 100644 --- a/Src/Base/AMReX_FabArrayUtility.H +++ b/Src/Base/AMReX_FabArrayUtility.H @@ -1602,6 +1602,209 @@ Dot (FabArray const& x, int xcomp, FabArray const& y, int ycomp, int n return sm; } +/** + * \brief Compute dot product of FabArray with itself + * + * \param x first FabArray + * \param xcomp starting component of x + * \param y second FabArray + * \param ycomp starting component of y + * \param ncomp number of components + * \param nghost number of ghost cells + * \param local If true, MPI communication is skipped. + */ +template ::value,int> FOO = 0> +typename FAB::value_type +Dot (FabArray const& x, int xcomp, int ncomp, IntVect const& nghost, bool local = false) +{ + BL_ASSERT(x.nGrowVect().allGE(nghost)); + + BL_PROFILE("amrex::Dot()"); + + using T = typename FAB::value_type; + auto sm = T(0.0); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& xma = x.const_arrays(); + sm = ParReduce(TypeList{}, TypeList{}, x, nghost, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple + { + auto t = T(0.0); + auto const& xfab = xma[box_no]; + for (int n = 0; n < ncomp; ++n) { + auto v = xfab(i,j,k,xcomp+n); + t += v*v; + } + return t; + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + auto const& xfab = x.const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + auto v = xfab(i,j,k,xcomp+n); + sm += v*v; + }); + } + } + + if (!local) { + ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); + } + + return sm; +} + +/** + * \brief Compute dot product of two FabArrays in region that mask is true + * + * \param mask mask + * \param x first FabArray + * \param xcomp starting component of x + * \param y second FabArray + * \param ycomp starting component of y + * \param ncomp number of components + * \param nghost number of ghost cells + * \param local If true, MPI communication is skipped. 
+ */ +template ::value && IsBaseFab::value,int> FOO = 0> +typename FAB::value_type +Dot (FabArray const& mask, FabArray const& x, int xcomp, + FabArray const& y, int ycomp, int ncomp, IntVect const& nghost, + bool local = false) +{ + BL_ASSERT(x.boxArray() == y.boxArray() && x.boxArray() == mask.boxArray()); + BL_ASSERT(x.DistributionMap() == y.DistributionMap() && x.DistributionMap() == mask.DistributionMap()); + BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost) && + mask.nGrowVect().allGE(nghost)); + + BL_PROFILE("amrex::Dot()"); + + using T = typename FAB::value_type; + auto sm = T(0.0); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& mma = mask.const_arrays(); + auto const& xma = x.const_arrays(); + auto const& yma = y.const_arrays(); + sm = ParReduce(TypeList{}, TypeList{}, x, nghost, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple + { + auto t = T(0.0); + auto m = T(mma[box_no](i,j,k)); + if (m != 0) { + auto const& xfab = xma[box_no]; + auto const& yfab = yma[box_no]; + for (int n = 0; n < ncomp; ++n) { + t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + } + } + return t*m; + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + auto const& mfab = mask.const_array(mfi); + auto const& xfab = x.const_array(mfi); + auto const& yfab = y.const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + auto m = T(mfab(i,j,k)); + sm += m * xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n); + }); + } + } + + if (!local) { + ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); + } + + return sm; +} + +/** + * \brief Compute dot product of FabArray with itself in region that mask is true + * + * \param mask mask + * \param x FabArray + * \param xcomp starting component of x + * \param ncomp number of components + * \param nghost number of ghost cells + * \param local If true, MPI communication is skipped. 
+ */ +template ::value && IsBaseFab::value,int> FOO = 0> +typename FAB::value_type +Dot (FabArray const& mask, FabArray const& x, int xcomp, int ncomp, + IntVect const& nghost, bool local = false) +{ + BL_ASSERT(x.boxArray() == mask.boxArray()); + BL_ASSERT(x.DistributionMap() == mask.DistributionMap()); + BL_ASSERT(x.nGrowVect().allGE(nghost) && mask.nGrowVect().allGE(nghost)); + + BL_PROFILE("amrex::Dot()"); + + using T = typename FAB::value_type; + auto sm = T(0.0); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& mma = mask.const_arrays(); + auto const& xma = x.const_arrays(); + sm = ParReduce(TypeList{}, TypeList{}, x, nghost, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple + { + auto t = T(0.0); + auto m = T(mma[box_no](i,j,k)); + if (m != 0) { + auto const& xfab = xma[box_no]; + for (int n = 0; n < ncomp; ++n) { + auto v = xfab(i,j,k,xcomp+n); + t += v*v; + } + } + return t*m; + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm) +#endif + for (MFIter mfi(x,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.growntilebox(nghost); + auto const& mfab = mask.const_array(mfi); + auto const& xfab = x.const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + auto m = T(mfab(i,j,k)); + auto v = xfab(i,j,k,xcomp+n); + sm += m*v*v; + }); + } + } + + if (!local) { + ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); + } + + return sm; +} + //! dst = val template ,int> = 0> void setVal (MF& dst, typename MF::value_type val) diff --git a/Src/Base/AMReX_Vector.H b/Src/Base/AMReX_Vector.H index c468befa94b..4196e8de8ff 100644 --- a/Src/Base/AMReX_Vector.H +++ b/Src/Base/AMReX_Vector.H @@ -66,6 +66,15 @@ namespace amrex return r; } + template + [[nodiscard]] Vector*> GetVecOfPtrs (Vector>& a) + { + Vector*> r; + r.reserve(a.size()); + for (auto& x : a) { r.push_back(&x); } + return r; + } + template [[nodiscard]] Vector GetVecOfPtrs (const Vector >& a) { @@ -86,6 +95,15 @@ namespace amrex return r; } + template + [[nodiscard]] Vector const*> GetVecOfConstPtrs (Vector> const& a) + { + Vector const*> r; + r.reserve(a.size()); + for (auto& x : a) { r.push_back(&x); } + return r; + } + template [[nodiscard]] Vector GetVecOfConstPtrs (const Vector >& a) { diff --git a/Src/LinearSolvers/AMReX_GMRES_MLMG.H b/Src/LinearSolvers/AMReX_GMRES_MLMG.H index 56ee477250e..ebc8d4be3b5 100644 --- a/Src/LinearSolvers/AMReX_GMRES_MLMG.H +++ b/Src/LinearSolvers/AMReX_GMRES_MLMG.H @@ -4,6 +4,7 @@ #include #include +#include #include namespace amrex { @@ -19,9 +20,10 @@ template class GMRESMLMGT { public: + using VEC = Vector; using MG = MLMGT; using RT = typename MG::RT; // double or float - using GM = GMRES>; + using GM = GMRES,GMRESMLMGT>; explicit GMRESMLMGT (MG& mlmg); @@ -35,8 +37,13 @@ public: */ void solve (MF& a_sol, MF const& a_rhs, RT a_tol_rel, RT a_tol_abs); + void solve (Vector const& a_sol, Vector const& a_rhs, RT a_tol_rel, RT a_tol_abs); + //! Sets verbosity. - void setVerbose (int v) { m_gmres.setVerbose(v); } + void setVerbose (int v) { + m_verbose = v; + m_gmres.setVerbose(v); + } //! Sets the max number of iterations void setMaxIters (int niters) { m_gmres.setMaxIters(niters); } @@ -50,46 +57,34 @@ public: //! Get the GMRES object. GM& getGMRES () { return m_gmres; } - /** - * \brief Set MLMG's multiplicative property of zero - * - * This should NOT be called unless MLMG has the multiplicative property - * of zero. 
MLMG is not a matrix, and usually does not have the - * properties of a matrix such as the multiplicative property of zero - * (i.e., M*0=0) because how domain boundary conditions are - * handled. However, if MLMG has the property of zero, calling this - * function with true can have a small performance benefit. - */ - void setPropertyOfZero (bool b) { m_prop_zero = b; } - //! Make MultiFab without ghost cells - MF makeVecRHS () const; + VEC makeVecRHS () const; //! Make MultiFab with ghost cells and set ghost cells to zero - MF makeVecLHS () const; + VEC makeVecLHS () const; - RT norm2 (MF const& mf) const; + RT norm2 (VEC const& mf) const; - static void scale (MF& mf, RT scale_factor); + static void scale (VEC& mf, RT scale_factor); - RT dotProduct (MF const& mf1, MF const& mf2) const; + RT dotProduct (VEC const& mf1, VEC const& mf2) const; //! lhs = 0 - static void setToZero (MF& lhs); + static void setToZero (VEC& lhs); //! lhs = rhs - static void assign (MF& lhs, MF const& rhs); + static void assign (VEC& lhs, VEC const& rhs); //! lhs += a*rhs - static void increment (MF& lhs, MF const& rhs, RT a); + static void increment (VEC& lhs, VEC const& rhs, RT a); //! lhs = a*rhs_a + b*rhs_b - static void linComb (MF& lhs, RT a, MF const& rhs_a, RT b, MF const& rhs_b); + static void linComb (VEC& lhs, RT a, VEC const& rhs_a, RT b, VEC const& rhs_b); //! lhs = L(rhs) - void apply (MF& lhs, MF const& rhs) const; + void apply (VEC& lhs, VEC const& rhs) const; - void precond (MF& lhs, MF const& rhs) const; + void precond (VEC& lhs, VEC const& rhs) const; //! Control whether or not to use MLMG as preconditioner. bool usePrecond (bool new_flag) { return std::exchange(m_use_precond, new_flag); } @@ -101,132 +96,171 @@ private: GM m_gmres; MG* m_mlmg; MLLinOpT* m_linop; + RT m_precon_reltol = 1.e-4; + int m_verbose = 0; + int m_nlevels = 0; bool m_use_precond = true; - bool m_prop_zero = false; int m_precond_niters = 1; }; template GMRESMLMGT::GMRESMLMGT (MG& mlmg) - : m_mlmg(&mlmg), m_linop(&mlmg.getLinOp()) + : m_mlmg(&mlmg), m_linop(&mlmg.getLinOp()), m_nlevels(m_linop->NAMRLevels()) { - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_linop->NAMRLevels() == 1, - "Only support single level solve"); - m_mlmg->setVerbose(0); - m_mlmg->setBottomVerbose(0); - m_mlmg->prepareForGMRES(); + m_mlmg->preparePrecond(); m_gmres.define(*this); } template -auto GMRESMLMGT::makeVecRHS () const -> MF +auto GMRESMLMGT::makeVecRHS () const -> VEC { - return m_linop->make(0, 0, IntVect(0)); + VEC vmf(m_nlevels); + for (int ilev = 0; ilev < m_nlevels; ++ilev) { + vmf[ilev] = m_linop->make(ilev, 0, IntVect(0)); + } + return vmf; } template -auto GMRESMLMGT::makeVecLHS () const -> MF +auto GMRESMLMGT::makeVecLHS () const -> VEC { - auto mf = m_linop->make(0, 0, IntVect(1)); - setBndry(mf, RT(0), 0, nComp(mf)); - return mf; + VEC vmf(m_nlevels); + for (int ilev = 0; ilev < m_nlevels; ++ilev) { + vmf[ilev] = m_linop->make(ilev, 0, IntVect(1)); + setBndry(vmf[ilev], RT(0), 0, nComp(vmf[ilev])); + } + return vmf; } template -auto GMRESMLMGT::norm2 (MF const& mf) const -> RT +auto GMRESMLMGT::norm2 (VEC const& mf) const -> RT { - auto r = m_linop->xdoty(0, 0, mf, mf, false); - return std::sqrt(r); + return m_linop->norm2Precond(GetVecOfConstPtrs(mf)); } template -void GMRESMLMGT::scale (MF& mf, RT scale_factor) +void GMRESMLMGT::scale (VEC& mf, RT scale_factor) { - Scale(mf, scale_factor, 0, nComp(mf), 0); + for (auto& xmf : mf) { + Scale(xmf, scale_factor, 0, nComp(xmf), 0); + } } template -auto GMRESMLMGT::dotProduct (MF const& mf1, MF 
const& mf2) const -> RT +auto GMRESMLMGT::dotProduct (VEC const& mf1, VEC const& mf2) const -> RT { - return m_linop->xdoty(0, 0, mf1, mf2, false); + return m_linop->dotProductPrecond(GetVecOfConstPtrs(mf1), GetVecOfConstPtrs(mf2)); } template -void GMRESMLMGT::setToZero (MF& lhs) +void GMRESMLMGT::setToZero (VEC& lhs) { - setVal(lhs, RT(0.0)); + for (auto& xmf : lhs) { + setVal(xmf, RT(0.0)); + } } template -void GMRESMLMGT::assign (MF& lhs, MF const& rhs) +void GMRESMLMGT::assign (VEC& lhs, VEC const& rhs) { - LocalCopy(lhs, rhs, 0, 0, nComp(lhs), IntVect(0)); + auto nlevels = int(lhs.size()); + for (int ilev = 0; ilev < nlevels; ++ilev) { + LocalCopy(lhs[ilev], rhs[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0)); + } } template -void GMRESMLMGT::increment (MF& lhs, MF const& rhs, RT a) +void GMRESMLMGT::increment (VEC& lhs, VEC const& rhs, RT a) { - Saxpy(lhs, a, rhs, 0, 0, nComp(lhs), IntVect(0)); + auto nlevels = int(lhs.size()); + for (int ilev = 0; ilev < nlevels; ++ilev) { + Saxpy(lhs[ilev], a, rhs[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0)); + } } template -void GMRESMLMGT::linComb (MF& lhs, RT a, MF const& rhs_a, RT b, MF const& rhs_b) +void GMRESMLMGT::linComb (VEC& lhs, RT a, VEC const& rhs_a, RT b, VEC const& rhs_b) { - LinComb(lhs, a, rhs_a, 0, b, rhs_b, 0, 0, nComp(lhs), IntVect(0)); + auto nlevels = int(lhs.size()); + for (int ilev = 0; ilev < nlevels; ++ilev) { + LinComb(lhs[ilev], a, rhs_a[ilev], 0, b, rhs_b[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0)); + } } template -void GMRESMLMGT::apply (MF& lhs, MF const& rhs) const +void GMRESMLMGT::apply (VEC& lhs, VEC const& rhs) const { - m_linop->apply(0, 0, lhs, const_cast(rhs), - MLLinOpT::BCMode::Homogeneous, - MLLinOpT::StateMode::Correction); + m_mlmg->applyPrecond(GetVecOfPtrs(lhs), GetVecOfPtrs(const_cast(rhs))); } template -void GMRESMLMGT::precond (MF& lhs, MF const& rhs) const +void GMRESMLMGT::precond (VEC& lhs, VEC const& rhs) const { if (m_use_precond) { - m_mlmg->prepareMGcycle(); - - for (int icycle = 0; icycle < m_precond_niters; ++icycle) { - if (icycle == 0) { - LocalCopy(m_mlmg->res[0][0], rhs, 0, 0, nComp(rhs), IntVect(0)); - } else { - m_mlmg->computeResOfCorrection(0,0); - LocalCopy(m_mlmg->res[0][0], m_mlmg->rescor[0][0], 0, 0, nComp(rhs), IntVect(0)); - } - - m_mlmg->mgVcycle(0,0); - - if (icycle == 0) { - LocalCopy(lhs, m_mlmg->cor[0][0], 0, 0, nComp(rhs), IntVect(0)); - } else { - increment(lhs, m_mlmg->cor[0][0], RT(1)); - } - } + m_mlmg->setPrecondIter(m_precond_niters); + setToZero(lhs); + m_mlmg->precond(GetVecOfPtrs(lhs), GetVecOfConstPtrs(rhs), 0, 0); } else { - LocalCopy(lhs, rhs, 0, 0, nComp(lhs), IntVect(0)); + for (int ilev = 0; ilev < m_nlevels; ++ilev) { + LocalCopy(lhs[ilev], rhs[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0)); + } } } template void GMRESMLMGT::solve (MF& a_sol, MF const& a_rhs, RT a_tol_rel, RT a_tol_abs) { - if (m_prop_zero) { - auto rhs = makeVecRHS(); - assign(rhs, a_rhs); - m_linop->setDirichletNodesToZero(0,0,rhs); - m_gmres.solve(a_sol, rhs, a_tol_rel, a_tol_abs); - } else { - auto res = makeVecRHS(); - m_mlmg->apply({&res}, {&a_sol}); // res = L(sol) - increment(res, a_rhs, RT(-1)); // res = L(sol) - rhs - auto cor = makeVecLHS(); - m_linop->setDirichletNodesToZero(0,0,res); - m_gmres.solve(cor, res, a_tol_rel, a_tol_abs); // L(cor) = res - increment(a_sol, cor, RT(-1)); // sol = sol - cor + AMREX_ALWAYS_ASSERT(m_nlevels == 1); + this->solve({&a_sol}, {&a_rhs}, a_tol_rel, a_tol_abs); +} + +template +void GMRESMLMGT::solve (Vector const& a_sol, Vector const& a_rhs, RT a_tol_rel, 
RT a_tol_abs) +{ + m_mlmg->incPrintIdentation(); + auto mlmg_verbose = m_mlmg->getVerbose(); + auto mlmg_bottom_verbose = m_mlmg->getBottomVerbose(); + m_mlmg->setVerbose(m_verbose); + auto mlmg_bottom_solver = m_mlmg->getBottomSolver(); + + if (mlmg_bottom_solver != BottomSolver::smoother && + mlmg_bottom_solver != BottomSolver::hypre && + mlmg_bottom_solver != BottomSolver::petsc) + { + m_mlmg->setBottomSolver(BottomSolver::smoother); } + + auto res = makeVecLHS(); + auto cor = makeVecLHS(); + + m_mlmg->apply(GetVecOfPtrs(res), a_sol); // res = L(sol) + // res = L(sol) - rhs + bool need_to_scale_rhs = m_linop->scaleRHS(0,nullptr); + for (int ilev = 0; ilev < m_nlevels; ++ilev) { + MF const* prhs; + if (need_to_scale_rhs) { + LocalCopy(cor[ilev], *a_rhs[ilev], 0, 0, nComp(cor[ilev]), IntVect(0)); + auto r = m_linop->scaleRHS(ilev, &(cor[ilev])); + amrex::ignore_unused(r); + prhs = &(cor[ilev]); + } else { + prhs = a_rhs[ilev]; + } + Saxpy(res[ilev], RT(-1), *prhs, 0, 0, nComp(res[ilev]), IntVect(0)); + } + for (int ilev = 0; ilev < m_nlevels; ++ilev) { + m_linop->setDirichletNodesToZero(ilev,0,res[ilev]); + } + m_gmres.solve(cor, res, a_tol_rel, a_tol_abs); // L(cor) = res + // sol = sol - cor + for (int ilev = 0; ilev < m_nlevels; ++ilev) { + Saxpy(*a_sol[ilev], RT(-1), cor[ilev], 0, 0, nComp(*a_sol[ilev]), IntVect(0)); + } + + m_mlmg->setBottomSolver(mlmg_bottom_solver); + m_mlmg->setVerbose(mlmg_verbose); + m_mlmg->setBottomVerbose(mlmg_bottom_verbose); + m_mlmg->decPrintIdentation(); } using GMRESMLMG = GMRESMLMGT; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index 30d0b96a6fd..11f4fc42c99 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -42,6 +42,7 @@ public: void setMaxIter (int _maxiter) { maxiter = _maxiter; } [[nodiscard]] int getMaxIter () const { return maxiter; } + void setPrintIdentation (std::string s) { print_ident = std::move(s); } /** * Is the initial guess provided to the solver zero ? @@ -73,6 +74,7 @@ private: IntVect nghost = IntVect(0); int iter = -1; bool initial_vec_zeroed = false; + std::string print_ident; }; template @@ -133,7 +135,7 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 0 ) { - amrex::Print() << "MLCGSolver_BiCGStab: Initial error (error0) = " << rnorm0 << '\n'; + amrex::Print() << print_ident << "MLCGSolver_BiCGStab: Initial error (error0) = " << rnorm0 << '\n'; } int ret = 0; iter = 1; @@ -143,7 +145,7 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) { if ( verbose > 0 ) { - amrex::Print() << "MLCGSolver_BiCGStab: niter = 0," + amrex::Print() << print_ident << "MLCGSolver_BiCGStab: niter = 0," << ", rnorm = " << rnorm << ", eps_abs = " << eps_abs << '\n'; } @@ -186,7 +188,7 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 2 && ParallelDescriptor::IOProcessor() ) { - amrex::Print() << "MLCGSolver_BiCGStab: Half Iter " + amrex::Print() << print_ident << "MLCGSolver_BiCGStab: Half Iter " << std::setw(11) << iter << " rel. err. " << rnorm/(rnorm0) << '\n'; @@ -222,7 +224,7 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 2 ) { - amrex::Print() << "MLCGSolver_BiCGStab: Iteration " + amrex::Print() << print_ident << "MLCGSolver_BiCGStab: Iteration " << std::setw(11) << iter << " rel. err. 
" << rnorm/(rnorm0) << '\n'; @@ -239,7 +241,7 @@ MLCGSolverT::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 0 ) { - amrex::Print() << "MLCGSolver_BiCGStab: Final: Iteration " + amrex::Print() << print_ident << "MLCGSolver_BiCGStab: Final: Iteration " << std::setw(4) << iter << " rel. err. " << rnorm/(rnorm0) << '\n'; @@ -303,7 +305,7 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 0 ) { - amrex::Print() << "MLCGSolver_CG: Initial error (error0) : " << rnorm0 << '\n'; + amrex::Print() << print_ident << "MLCGSolver_CG: Initial error (error0) : " << rnorm0 << '\n'; } RT rho_1 = 0; @@ -313,7 +315,7 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( rnorm0 == 0 || rnorm0 < eps_abs ) { if ( verbose > 0 ) { - amrex::Print() << "MLCGSolver_CG: niter = 0," + amrex::Print() << print_ident << "MLCGSolver_CG: niter = 0," << ", rnorm = " << rnorm << ", eps_abs = " << eps_abs << '\n'; } @@ -352,7 +354,7 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 2 ) { - amrex::Print() << "MLCGSolver_cg:" + amrex::Print() << print_ident << "MLCGSolver_cg:" << " iter " << iter << " rho " << rho << " alpha " << alpha << '\n'; @@ -363,7 +365,7 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 2 ) { - amrex::Print() << "MLCGSolver_cg: Iteration" + amrex::Print() << print_ident << "MLCGSolver_cg: Iteration" << std::setw(4) << iter << " rel. err. " << rnorm/(rnorm0) << '\n'; @@ -376,7 +378,7 @@ MLCGSolverT::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs) if ( verbose > 0 ) { - amrex::Print() << "MLCGSolver_cg: Final Iteration" + amrex::Print() << print_ident << "MLCGSolver_cg: Final Iteration" << std::setw(4) << iter << " rel. err. " << rnorm/(rnorm0) << '\n'; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H index b318b318eb2..3138c89450b 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H @@ -113,6 +113,10 @@ public: RT xdoty (int amrlev, int mglev, const MF& x, const MF& y, bool local) const final; + RT dotProductPrecond (Vector const& x, Vector const& y) const final; + + RT norm2Precond (Vector const& x) const final; + virtual void Fapply (int amrlev, int mglev, MF& out, const MF& in) const = 0; virtual void Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redblack) const = 0; virtual void FFlux (int amrlev, const MFIter& mfi, @@ -130,6 +134,9 @@ public: void avgDownResAmr (int clev, MF& cres, MF const& fres) const override; + void beginPrecondBC () override; + void endPrecondBC () override; + struct BCTL { BoundCond type; RT location; @@ -151,6 +158,8 @@ protected: Vector> > m_bndry_cor; Vector> > m_crse_cor_br; + Vector> > m_bndry_sol_zero; + // In case of agglomeration, coarse MG grids on amr level 0 are // not simply coarsened from fine MG grids. So we need to build // bcond and bcloc for each MG level. 
@@ -1939,6 +1948,37 @@ MLCellLinOpT::xdoty (int /*amrlev*/, int /*mglev*/, const MF& x, const MF& y return result; } +template +auto +MLCellLinOpT::dotProductPrecond (Vector const& x, Vector const& y) const -> RT +{ + const int ncomp = this->getNComp(); + const IntVect nghost(0); + RT result = 0; + for (int ilev = 0; ilev < this->NAMRLevels()-1; ++ilev) { + result += amrex::Dot(*m_norm_fine_mask[ilev], *x[ilev], 0, *y[ilev], 0, ncomp, nghost, true); + } + result += amrex::Dot(*x[this->NAMRLevels()-1], 0, + *y[this->NAMRLevels()-1], 0, ncomp, nghost, true); + ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub()); + return result; +} + +template +auto +MLCellLinOpT::norm2Precond (Vector const& x) const -> RT +{ + const int ncomp = this->getNComp(); + const IntVect nghost(0); + RT result = 0; + for (int ilev = 0; ilev < this->NAMRLevels()-1; ++ilev) { + result += amrex::Dot(*m_norm_fine_mask[ilev], *x[ilev], 0, ncomp, nghost, true); + } + result += amrex::Dot(*x[this->NAMRLevels()-1], 0, ncomp, nghost, true); + ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub()); + return std::sqrt(result); +} + template void MLCellLinOpT::computeVolInv () const @@ -2141,6 +2181,42 @@ MLCellLinOpT::avgDownResAmr (int clev, MF& cres, MF const& fres) const } } +template +void +MLCellLinOpT::beginPrecondBC () +{ + this->m_precond_mode = true; + + if (m_bndry_sol_zero.empty()) { + m_bndry_sol_zero.resize(m_bndry_sol.size()); + const int ncomp = this->getNComp(); + for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev) { + m_bndry_sol_zero[amrlev] = std::make_unique> + (this->m_grids[amrlev][0], + this->m_dmap[amrlev][0], + ncomp, + this->m_geom[amrlev][0]); + } + std::swap(m_bndry_sol, m_bndry_sol_zero); + MF const* coarse_data_for_bc_save = this->m_coarse_data_for_bc; + this->m_coarse_data_for_bc = nullptr; + for (int amrlev = 0; amrlev < this->m_num_amr_levels; ++amrlev) { + this->setLevelBC(amrlev, nullptr); + } + this->m_coarse_data_for_bc = coarse_data_for_bc_save; + } else { + std::swap(m_bndry_sol, m_bndry_sol_zero); + } +} + +template +void +MLCellLinOpT::endPrecondBC () +{ + this->m_precond_mode = false; + std::swap(m_bndry_sol, m_bndry_sol_zero); +} + extern template class MLCellLinOpT; using MLCellLinOp = MLCellLinOpT; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_F.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_F.cpp index f77cd8de900..940bfa045a2 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_F.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap_F.cpp @@ -33,7 +33,7 @@ MLEBABecLap::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) c const auto *const ccent = (factory) ? 
&(factory->getCentroid()) : nullptr; const bool is_eb_dirichlet = isEBDirichlet(); - const bool is_eb_inhomog = m_is_eb_inhomog; + const bool is_eb_inhomog = m_is_eb_inhomog && (!this->m_precond_mode); const int ncomp = getNComp(); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H index 6ebbd2c65e7..89bc763cea5 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H @@ -76,7 +76,7 @@ public: [[nodiscard]] std::unique_ptr > makeFactory (int amrlev, int mglev) const final; - void scaleRHS (int amrlev, MultiFab& rhs) const final; + [[nodiscard]] bool scaleRHS (int amrlev, MultiFab* rhs) const final; #endif @@ -117,7 +117,7 @@ public: Array4 const& bfab) const override; #endif - void postSolve (Vector& sol) const override; + void postSolve (Vector const& sol) const override; private: GpuArray m_sigma{{AMREX_D_DECL(1_rt,1_rt,1_rt)}}; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp index b5a69e455a1..61580e4d368 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp @@ -363,34 +363,39 @@ MLEBNodeFDLaplacian::prepareForSolve () } #ifdef AMREX_USE_EB -void -MLEBNodeFDLaplacian::scaleRHS (int amrlev, MultiFab& rhs) const +bool +MLEBNodeFDLaplacian::scaleRHS (int amrlev, MultiFab* rhs) const { const auto *factory = dynamic_cast(m_factory[amrlev][0].get()); - if (!factory) { return; } - auto const& dmask = *m_dirichlet_mask[amrlev][0]; - auto const& edgecent = factory->getEdgeCent(); + if (!factory) {return false; } + + if (rhs) { + auto const& dmask = *m_dirichlet_mask[amrlev][0]; + auto const& edgecent = factory->getEdgeCent(); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for (MFIter mfi(rhs,TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - const Box& box = mfi.tilebox(); - Array4 const& rhsarr = rhs.array(mfi); - Array4 const& dmarr = dmask.const_array(mfi); - bool cutfab = edgecent[0]->ok(mfi); - if (cutfab) { - AMREX_D_TERM(Array4 const& ecx = edgecent[0]->const_array(mfi);, - Array4 const& ecy = edgecent[1]->const_array(mfi);, - Array4 const& ecz = edgecent[2]->const_array(mfi)); - AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, - { - mlebndfdlap_scale_rhs(i,j,k,rhsarr,dmarr,AMREX_D_DECL(ecx,ecy,ecz)); - }); + for (MFIter mfi(*rhs,TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& box = mfi.tilebox(); + Array4 const& rhsarr = rhs->array(mfi); + Array4 const& dmarr = dmask.const_array(mfi); + bool cutfab = edgecent[0]->ok(mfi); + if (cutfab) { + AMREX_D_TERM(Array4 const& ecx = edgecent[0]->const_array(mfi);, + Array4 const& ecy = edgecent[1]->const_array(mfi);, + Array4 const& ecz = edgecent[2]->const_array(mfi)); + AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, + { + mlebndfdlap_scale_rhs(i,j,k,rhsarr,dmarr,AMREX_D_DECL(ecx,ecy,ecz)); + }); + } } } + + return true; } #endif @@ -414,7 +419,7 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa auto const& dmask = *m_dirichlet_mask[amrlev][mglev]; #ifdef AMREX_USE_EB - const auto phieb = (m_in_solution_mode) ? m_s_phi_eb : Real(0.0); + const auto phieb = (m_in_solution_mode && !this->m_precond_mode) ? 
m_s_phi_eb : Real(0.0); const auto *factory = dynamic_cast(m_factory[amrlev][mglev].get()); Array edgecent {AMREX_D_DECL(nullptr,nullptr,nullptr)}; if (factory) { @@ -751,16 +756,17 @@ MLEBNodeFDLaplacian::fillRHS (MFIter const& /*mfi*/, Array4 const& /* #endif void -MLEBNodeFDLaplacian::postSolve (Vector& sol) const +MLEBNodeFDLaplacian::postSolve (Vector const& sol) const { #ifdef AMREX_USE_EB + if (this->m_precond_mode) { return; } for (int amrlev = 0; amrlev < m_num_amr_levels; ++amrlev) { const auto phieb = m_s_phi_eb; const auto *factory = dynamic_cast(m_factory[amrlev][0].get()); if (!factory) { return; } auto const& levset_mf = factory->getLevelSet(); auto const& levset_ar = levset_mf.const_arrays(); - MultiFab& mf = sol[amrlev]; + MultiFab& mf = *sol[amrlev]; auto const& sol_ar = mf.arrays(); if (phieb == std::numeric_limits::lowest()) { auto const& phieb_ar = m_phi_eb[amrlev].const_arrays(); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp index 3fd1b26cae4..894a9657f24 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp @@ -269,7 +269,7 @@ MLEBTensorOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode Array4 foo; const bool is_eb_dirichlet = isEBDirichlet(); - const bool is_eb_inhomog = m_is_eb_inhomog; + const bool is_eb_inhomog = m_is_eb_inhomog && !this->m_precond_mode; Array4 const& velbfab = (is_eb_dirichlet && is_eb_inhomog) ? m_eb_phi[amrlev]->const_array(mfi) : foo; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H index ce6a8b53335..08fa588b4a0 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H @@ -455,7 +455,9 @@ public: virtual void applyOverset (int /*amlev*/, MF& /*rhs*/) const {} //! scale RHS to fix solvability - virtual void scaleRHS (int /*amrlev*/, MF& /*rhs*/) const {} + [[nodiscard]] virtual bool scaleRHS (int /*amrlev*/, MF* /*rhs*/) const { + return false; + } //! get offset for fixing solvability virtual Vector getSolvabilityOffset (int /*amrlev*/, int /*mglev*/, @@ -467,7 +469,7 @@ public: virtual void prepareForSolve () = 0; - virtual void prepareForGMRES () {} + virtual void preparePrecond () {} //! For GMRES to work, this might need to be implemented to mask out //! Dirichlet nodes or cells (e.g., EB covered cells or overset cells) @@ -485,6 +487,10 @@ public: //! x dot y, used by the bottom solver virtual RT xdoty (int amrlev, int mglev, const MF& x, const MF& y, bool local) const = 0; + virtual RT dotProductPrecond (Vector const& x, Vector const& y) const; + + virtual RT norm2Precond (Vector const& x) const; + virtual std::unique_ptr> makeNLinOp (int /*grid_size*/) const { amrex::Abort("MLLinOp::makeNLinOp: N-Solve not supported"); @@ -533,7 +539,7 @@ public: virtual void copyNSolveSolution (MF&, MF const&) const {} - virtual void postSolve (Vector& /*sol*/) const {} + virtual void postSolve (Vector const& /*sol*/) const {} [[nodiscard]] virtual RT normInf (int amrlev, MF const& mf, bool local) const = 0; @@ -548,6 +554,9 @@ public: // This function is needed for FMG cycle, but not V-cycle. virtual void avgDownResMG (int clev, MF& cres, MF const& fres) const; + virtual void beginPrecondBC () { m_precond_mode = true; } + virtual void endPrecondBC () { m_precond_mode = false; } + [[nodiscard]] bool isMFIterSafe (int amrlev, int mglev1, int mglev2) const; //! 
Return the number of AMR levels @@ -628,6 +637,8 @@ protected: const MF* m_coarse_data_for_bc = nullptr; MF m_coarse_data_for_bc_raii; + bool m_precond_mode = false; + //! Return AMR refinement ratios [[nodiscard]] const Vector& AMRRefRatio () const noexcept { return m_amr_ref_ratio; } @@ -1643,6 +1654,23 @@ MLLinOpT::setLevelBC (int amrlev, const AMF* levelbcdata, robin_b_raii[amrlev].get(), robin_f_raii[amrlev].get()); } +template +auto +MLLinOpT::dotProductPrecond (Vector const& x, Vector const& y) const -> RT +{ + AMREX_ALWAYS_ASSERT(NAMRLevels() == 1); + return xdoty(0,0,*x[0],*y[0],false); +} + +template +auto +MLLinOpT::norm2Precond (Vector const& x) const -> RT +{ + AMREX_ALWAYS_ASSERT(NAMRLevels() == 1); + auto r = xdoty(0,0,*x[0],*x[0],false); + return std::sqrt(r); +} + extern template class MLLinOpT; using MLLinOp = MLLinOpT; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H index 78b2ffdd3df..4b666f2c03d 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H @@ -50,6 +50,9 @@ public: std::initializer_list a_rhs, RT a_tol_rel, RT a_tol_abs, const char* checkpoint_file = nullptr); + RT precond (Vector const& a_sol, Vector const& a_rhs, + RT a_tol_rel, RT a_tol_abs); + template void getGradSolution (const Vector >& a_grad_sol, Location a_loc = Location::FaceCenter); @@ -113,11 +116,21 @@ public: */ void apply (const Vector& out, const Vector& in); + //! out = L(in) as a preconditioner + void applyPrecond (const Vector& out, const Vector& in); + + [[nodiscard]] int getVerbose () const { return verbose; } + [[nodiscard]] int getBottomVerbose () const { return bottom_verbose; } + + void incPrintIdentation (); + void decPrintIdentation (); + void setThrowException (bool t) noexcept { throw_exception = t; } void setVerbose (int v) noexcept { verbose = v; } void setMaxIter (int n) noexcept { max_iters = n; } void setMaxFmgIter (int n) noexcept { max_fmg_iters = n; } void setFixedIter (int nit) noexcept { do_fixed_number_of_iters = nit; } + void setPrecondIter (int nit) noexcept { max_precond_iters = nit; } void setPreSmooth (int n) noexcept { nu1 = n; } void setPostSmooth (int n) noexcept { nu2 = n; } @@ -125,12 +138,13 @@ public: void setBottomSmooth (int n) noexcept { nub = n; } void setBottomSolver (BottomSolver s) noexcept { bottom_solver = s; } + [[nodiscard]] BottomSolver getBottomSolver () const noexcept { return bottom_solver; } void setCFStrategy (CFStrategy a_cf_strategy) noexcept {cf_strategy = a_cf_strategy;} void setBottomVerbose (int v) noexcept { bottom_verbose = v; } void setBottomMaxIter (int n) noexcept { bottom_maxiter = n; } void setBottomTolerance (RT t) noexcept { bottom_reltol = t; } void setBottomToleranceAbs (RT t) noexcept { bottom_abstol = t;} - RT getBottomToleranceAbs () noexcept{ return bottom_abstol; } + [[nodiscard]] RT getBottomToleranceAbs () const noexcept{ return bottom_abstol; } void setAlwaysUseBNorm (int flag) noexcept { always_use_bnorm = flag; } @@ -173,9 +187,7 @@ public: void prepareLinOp (); - void prepareMGcycle (); - - void prepareForGMRES (); + void preparePrecond (); void oneIter (int iter); @@ -231,11 +243,13 @@ public: private: + bool precond_mode = false; bool throw_exception = false; int verbose = 1; int max_iters = 200; int do_fixed_number_of_iters = 0; + int max_precond_iters = 1; int nu1 = 2; //!< pre int nu2 = 2; //!< post @@ -271,6 +285,8 @@ private: std::unique_ptr ns_sol; std::unique_ptr ns_rhs; + std::string print_ident; + //! 
Hypre #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) // Hypre::Interface hypre_interface = Hypre::Interface::structed; @@ -400,8 +416,8 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, if (verbose >= 1) { - amrex::Print() << "MLMG: Initial rhs = " << rhsnorm0 << "\n" - << "MLMG: Initial residual (resid0) = " << resnorm0 << "\n"; + amrex::Print() << print_ident << "MLMG: Initial rhs = " << rhsnorm0 << "\n" + << print_ident << "MLMG: Initial residual (resid0) = " << resnorm0 << "\n"; } } @@ -422,7 +438,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, if (!is_nsolve && resnorm0 <= res_target) { composite_norminf = resnorm0; if (verbose >= 1) { - amrex::Print() << "MLMG: No iterations needed\n"; + amrex::Print() << print_ident << "MLMG: No iterations needed\n"; } } else { auto iter_start_time = amrex::second(); @@ -444,7 +460,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, m_iter_fine_resnorm0.push_back(fine_norminf); composite_norminf = fine_norminf; if (verbose >= 2) { - amrex::Print() << "MLMG: Iteration " << std::setw(3) << iter+1 << " Fine resid/" + amrex::Print() << print_ident << "MLMG: Iteration " << std::setw(3) << iter+1 << " Fine resid/" << norm_name << " = " << fine_norminf/max_norm << "\n"; } bool fine_converged = (fine_norminf <= res_target); @@ -456,7 +472,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, computeMLResidual(finest_amr_lev-1); RT crse_norminf = MLResNormInf(finest_amr_lev-1); if (verbose >= 2) { - amrex::Print() << "MLMG: Iteration " << std::setw(3) << iter+1 + amrex::Print() << print_ident << "MLMG: Iteration " << std::setw(3) << iter+1 << " Crse resid/" << norm_name << " = " << crse_norminf/max_norm << "\n"; } @@ -468,7 +484,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, if (converged) { if (verbose >= 1) { - amrex::Print() << "MLMG: Final Iter. " << iter+1 + amrex::Print() << print_ident << "MLMG: Final Iter. " << iter+1 << " resid, resid/" << norm_name << " = " << composite_norminf << ", " << composite_norminf/max_norm << "\n"; @@ -478,7 +494,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, if (composite_norminf > RT(1.e20)*max_norm) { if (verbose > 0) { - amrex::Print() << "MLMG: Failing to converge after " << iter+1 << " iterations." + amrex::Print() << print_ident << "MLMG: Failing to converge after " << iter+1 << " iterations." << " resid, resid/" << norm_name << " = " << composite_norminf << ", " << composite_norminf/max_norm << "\n"; @@ -495,7 +511,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, if (!converged && do_fixed_number_of_iters == 0) { if (verbose > 0) { - amrex::Print() << "MLMG: Failed to converge after " << max_iters << " iterations." + amrex::Print() << print_ident << "MLMG: Failed to converge after " << max_iters << " iterations." << " resid, resid/" << norm_name << " = " << composite_norminf << ", " << composite_norminf/max_norm << "\n"; @@ -510,7 +526,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, timer[iter_time] = amrex::second() - iter_start_time; } - linop.postSolve(sol); + linop.postSolve(GetVecOfPtrs(sol)); IntVect ng_back = final_fill_bc ? 
IntVect(1) : IntVect(0); if (linop.hasHiddenDimension()) { @@ -529,7 +545,7 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, ParallelContext::CommunicatorSub()); if (ParallelContext::MyProcSub() == 0) { - amrex::AllPrint() << "MLMG: Timers: Solve = " << timer[solve_time] + amrex::AllPrint() << print_ident << "MLMG: Timers: Solve = " << timer[solve_time] << " Iter = " << timer[iter_time] << " Bottom = " << timer[bottom_time] << "\n"; } @@ -540,6 +556,24 @@ MLMGT::solve (const Vector& a_sol, const Vector& a_rhs, return composite_norminf; } +template +auto +MLMGT::precond (const Vector& a_sol, const Vector& a_rhs, + RT a_tol_rel, RT a_tol_abs) -> RT +{ + precond_mode = true; + std::swap(max_precond_iters, do_fixed_number_of_iters); + linop.beginPrecondBC(); + + auto r = solve(a_sol, a_rhs, a_tol_rel, a_tol_abs); + + linop.endPrecondBC(); + std::swap(max_precond_iters, do_fixed_number_of_iters); + precond_mode = false; + + return r; +} + template void MLMGT::prepareForFluxes (Vector const& a_sol) @@ -931,6 +965,17 @@ MLMGT::apply (const Vector& out, const Vector& a_in) } } +template +void +MLMGT::applyPrecond (const Vector& out, const Vector& in) +{ + precond_mode = true; + linop.beginPrecondBC(); + apply(out, in); + linop.endPrecondBC(); + precond_mode = false; +} + template template void @@ -1008,7 +1053,10 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& linop.unimposeNeumannBC(alev, rhs[alev]); linop.applyInhomogNeumannTerm(alev, rhs[alev]); linop.applyOverset(alev, rhs[alev]); - linop.scaleRHS(alev, rhs[alev]); + if ( ! precond_mode) { + bool r = linop.scaleRHS(alev, &(rhs[alev])); + amrex::ignore_unused(r); + } #ifdef AMREX_USE_EB const auto *factory = dynamic_cast(linop.Factory(alev)); @@ -1105,12 +1153,12 @@ MLMGT::prepareForSolve (Vector const& a_sol, Vector const& } if (verbose >= 2) { - amrex::Print() << "MLMG: # of AMR levels: " << namrlevs << "\n" - << " # of MG levels on the coarsest AMR level: " << linop.NMGLevels(0) + amrex::Print() << print_ident << "MLMG: # of AMR levels: " << namrlevs << "\n" + << print_ident << " # of MG levels on the coarsest AMR level: " << linop.NMGLevels(0) << "\n"; if (ns_linop) { - amrex::Print() << " # of MG levels in N-Solve: " << ns_linop->NMGLevels(0) << "\n" - << " # of grids in N-Solve: " << ns_linop->m_grids[0][0].size() << "\n"; + amrex::Print() << print_ident << " # of MG levels in N-Solve: " << ns_linop->NMGLevels(0) << "\n" + << print_ident << " # of grids in N-Solve: " << ns_linop->m_grids[0][0].size() << "\n"; } } } @@ -1129,65 +1177,10 @@ MLMGT::prepareLinOp () template void -MLMGT::prepareForGMRES () +MLMGT::preparePrecond () { prepareLinOp(); - linop.prepareForGMRES(); -} - -template -void -MLMGT::prepareMGcycle () -{ - if (res.empty()) { - IntVect ng = linop.getNGrowVectRestriction(); - linop.make(res, ng); - linop.make(rescor, ng); - - for (int alev = 0; alev <= finest_amr_lev; ++alev) - { - const int nmglevs = linop.NMGLevels(alev); - for (int mglev = 0; mglev < nmglevs; ++mglev) - { - setVal(res [alev][mglev], RT(0.0)); - setVal(rescor[alev][mglev], RT(0.0)); - } - } - - IntVect ng_sol(1); - if (linop.hasHiddenDimension()) { ng_sol[linop.hiddenDirection()] = 0; } - ng = ng_sol; - - cor.resize(namrlevs); - for (int alev = 0; alev <= finest_amr_lev; ++alev) - { - const int nmglevs = linop.NMGLevels(alev); - cor[alev].resize(nmglevs); - for (int mglev = 0; mglev < nmglevs; ++mglev) - { - cor[alev][mglev] = linop.make(alev, mglev, ng); - setVal(cor[alev][mglev], RT(0.0)); - } - } - - 
cor_hold.resize(std::max(namrlevs-1,1)); - { - const int alev = 0; - const int nmglevs = linop.NMGLevels(alev); - cor_hold[alev].resize(nmglevs); - for (int mglev = 0; mglev < nmglevs-1; ++mglev) - { - cor_hold[alev][mglev] = linop.make(alev, mglev, ng); - setVal(cor_hold[alev][mglev], RT(0.0)); - } - } - for (int alev = 1; alev < finest_amr_lev; ++alev) - { - cor_hold[alev].resize(1); - cor_hold[alev][0] = linop.make(alev, 0, ng); - setVal(cor_hold[alev][0], RT(0.0)); - } - } + linop.preparePrecond(); } template @@ -1318,7 +1311,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { RT norm = norminf(res[amrlev][mglev],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev << " DN: Norm before smooth " << norm << "\n"; } @@ -1335,7 +1328,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { RT norm = norminf(rescor[amrlev][mglev],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev << " DN: Norm after smooth " << norm << "\n"; } @@ -1349,7 +1342,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { RT norm = norminf(res[amrlev][mglev_bottom],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev_bottom << " DN: Norm before bottom " << norm << "\n"; } bottomSolve(); @@ -1357,7 +1350,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) { computeResOfCorrection(amrlev, mglev_bottom); RT norm = norminf(rescor[amrlev][mglev_bottom],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev_bottom << " UP: Norm after bottom " << norm << "\n"; } } @@ -1366,7 +1359,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { RT norm = norminf(res[amrlev][mglev_bottom],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev_bottom << " Norm before smooth " << norm << "\n"; } setVal(cor[amrlev][mglev_bottom], RT(0.0)); @@ -1380,7 +1373,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) { computeResOfCorrection(amrlev, mglev_bottom); RT norm = norminf(rescor[amrlev][mglev_bottom],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev_bottom << " Norm after smooth " << norm << "\n"; } } @@ -1395,7 +1388,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) { computeResOfCorrection(amrlev, mglev); RT norm = norminf(rescor[amrlev][mglev],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev << " UP: Norm before smooth " << norm << "\n"; } for (int i = 0; i < nu2; ++i) { @@ -1408,7 +1401,7 @@ MLMGT::mgVcycle (int amrlev, int mglev_top) { computeResOfCorrection(amrlev, mglev); RT norm = norminf(rescor[amrlev][mglev],0,ncomp,IntVect(0)); - amrex::Print() << "AT LEVEL " << amrlev << " " << mglev + amrex::Print() << print_ident << "AT LEVEL " << amrlev << " " << mglev << " UP: Norm after smooth " << norm << "\n"; } } @@ -1616,13 +1609,14 @@ MLMGT::bottomSolveWithCG (MF& x, const MF& b, typename MLCGSolverT::Type MLCGSolverT cg_solver(linop); cg_solver.setSolver(type); 
cg_solver.setVerbose(bottom_verbose); + cg_solver.setPrintIdentation(print_ident); cg_solver.setMaxIter(bottom_maxiter); cg_solver.setInitSolnZeroed(true); if (cf_strategy == CFStrategy::ghostnodes) { cg_solver.setNGhost(linop.getNGrow()); } int ret = cg_solver.solve(x, b, bottom_reltol, bottom_abstol); if (ret != 0 && verbose > 1) { - amrex::Print() << "MLMG: Bottom solve failed.\n"; + amrex::Print() << print_ident << "MLMG: Bottom solve failed.\n"; } m_niters_cg.push_back(cg_solver.getNumIters()); return ret; @@ -1848,7 +1842,7 @@ MLMGT::makeSolvable () auto const& offset = linop.getSolvabilityOffset(0, 0, rhs[0]); if (verbose >= 4) { for (int c = 0; c < ncomp; ++c) { - amrex::Print() << "MLMG: Subtracting " << offset[c] << " from rhs component " + amrex::Print() << print_ident << "MLMG: Subtracting " << offset[c] << " from rhs component " << c << "\n"; } } @@ -1864,7 +1858,7 @@ MLMGT::makeSolvable (int amrlev, int mglev, MF& mf) auto const& offset = linop.getSolvabilityOffset(amrlev, mglev, mf); if (verbose >= 4) { for (int c = 0; c < ncomp; ++c) { - amrex::Print() << "MLMG: Subtracting " << offset[c] + amrex::Print() << print_ident << "MLMG: Subtracting " << offset[c] << " from mf component c = " << c << " on level (" << amrlev << ", " << mglev << ")\n"; } @@ -2031,6 +2025,24 @@ MLMGT::checkPoint (const Vector& a_sol, linop.checkPoint(file_name+"/linop"); } +template +void +MLMGT::incPrintIdentation () +{ + print_ident.resize(print_ident.size()+4, ' '); +} + +template +void +MLMGT::decPrintIdentation () +{ + if (print_ident.size() > 4) { + print_ident.resize(print_ident.size()-4, ' '); + } else { + print_ident.clear(); + } +} + extern template class MLMGT; using MLMG = MLMGT; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sync.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sync.cpp index fdf1f08ae59..1ae0205f986 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sync.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_sync.cpp @@ -674,9 +674,9 @@ MLNodeLaplacian::reflux (int crse_amrlev, MultiFab fine_res_for_coarse(amrex::coarsen(fba, amrrr), fdm, 1, 0); std::unique_ptr tmp_fine_res; - if (amrrr == 4 && !a_fine_res.nGrowVect().allGE(3)) { + if (!a_fine_res.nGrowVect().allGE(amrrr-1)) { tmp_fine_res = std::make_unique(a_fine_res.boxArray(), - a_fine_res.DistributionMap(), 1, 3); + a_fine_res.DistributionMap(), 1, amrrr-1); MultiFab::Copy(*tmp_fine_res, a_fine_res, 0, 0, 1, 0); } MultiFab& fine_res = (tmp_fine_res) ? *tmp_fine_res : a_fine_res; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H index 9adb2f12408..a9c1a65b525 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H @@ -59,7 +59,7 @@ public: void prepareForSolve () override; - void prepareForGMRES () override; + void preparePrecond () override; void setDirichletNodesToZero (int amrlev, int mglev, MultiFab& mf) const override; @@ -69,6 +69,11 @@ public: Real xdoty (int amrlev, int mglev, const MultiFab& x, const MultiFab& y, bool local) const final; + Real dotProductPrecond (Vector const& x, + Vector const& y) const final; + + Real norm2Precond (Vector const& x) const final; + virtual void applyBC (int amrlev, int mglev, MultiFab& phi, BCMode bc_mode, StateMode state_mode, bool skip_fillboundary=false) const; @@ -135,6 +140,7 @@ protected: Vector > > m_has_fine_bndry; // does this fab contain c/f boundary? 
MultiFab m_bottom_dot_mask; MultiFab m_coarse_dot_mask; + Vector m_precond_weight_mask; Vector > m_norm_fine_mask; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp index 929d05dc5a1..9719ac3d3e8 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp @@ -183,19 +183,41 @@ MLNodeLinOp::xdoty (int amrlev, int mglev, const MultiFab& x, const MultiFab& y, AMREX_ASSERT(mglev+1==m_num_mg_levels[0] || mglev==0); const auto& mask = (mglev+1 == m_num_mg_levels[0]) ? m_bottom_dot_mask : m_coarse_dot_mask; const int ncomp = y.nComp(); - const int nghost = 0; - MultiFab tmp(x.boxArray(), x.DistributionMap(), ncomp, 0); - MultiFab::Copy(tmp, x, 0, 0, ncomp, nghost); - for (int i = 0; i < ncomp; i++) { - MultiFab::Multiply(tmp, mask, 0, i, 1, nghost); - } - Real result = MultiFab::Dot(tmp,0,y,0,ncomp,nghost,true); + const IntVect nghost(0); + Real result = amrex::Dot(mask, x, 0, y, 0, ncomp, nghost, true); if (!local) { ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub()); } return result; } +Real +MLNodeLinOp::dotProductPrecond (Vector const& x, + Vector const& y) const +{ + Real result = 0; + const int ncomp = x[0]->nComp(); + for (int ilev = 0; ilev < NAMRLevels(); ++ilev) { + result += amrex::Dot(m_precond_weight_mask[ilev], + *x[ilev],0,*y[ilev],0,ncomp,IntVect(0),true); + } + ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub()); + return result; +} + +Real +MLNodeLinOp::norm2Precond (Vector const& x) const +{ + Real result = 0; + const int ncomp = x[0]->nComp(); + for (int ilev = 0; ilev < NAMRLevels(); ++ilev) { + result += amrex::Dot(m_precond_weight_mask[ilev], + *x[ilev],0,ncomp,IntVect(0),true); + } + ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub()); + return std::sqrt(result); +} + Vector MLNodeLinOp::getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rhs) const { @@ -384,17 +406,56 @@ MLNodeLinOp::buildMasks () } void -MLNodeLinOp::prepareForGMRES () +MLNodeLinOp::preparePrecond () { - if (m_coarse_dot_mask.empty()) { - int amrlev = 0; - int mglev = 0; - const Geometry& geom = m_geom[amrlev][mglev]; - const iMultiFab& omask = *m_owner_mask_top; - m_coarse_dot_mask.define(omask.boxArray(), omask.DistributionMap(), 1, 0); - const auto lobc = LoBC(); - const auto hibc = HiBC(); - MLNodeLinOp_set_dot_mask(m_coarse_dot_mask, omask, geom, lobc, hibc, m_coarsening_strategy); + if (m_precond_weight_mask.empty()) { + m_precond_weight_mask.resize(m_num_amr_levels); + for (int ilev = 0; ilev < m_num_amr_levels; ++ilev) { + m_precond_weight_mask[ilev].define(amrex::convert(m_grids[ilev][0],IntVect(1)), + m_dmap[ilev][0], 1, 0); + auto omask = makeOwnerMask(m_grids[ilev][0], + m_dmap[ilev][0], + m_geom[ilev][0]); + const auto lobc = LoBC(); + const auto hibc = HiBC(); + Box nddomain = amrex::surroundingNodes(m_geom[ilev][0].Domain()); + if (m_coarsening_strategy != MLNodeLinOp::CoarseningStrategy::Sigma) { + nddomain.grow(1000); // hack to avoid masks being modified at Neumann boundary + } + if (ilev < m_num_amr_levels-1) { + auto const& fmask = *m_nd_fine_mask[ilev]; +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(m_precond_weight_mask[ilev],TilingIfNotGPU()); + mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + Array4 const& dfab = m_precond_weight_mask[ilev].array(mfi); + Array4 const& sfab = omask->const_array(mfi); + Array4 const& ffab = fmask.const_array(mfi); + 
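+ // Build the dot-product weight used by dotProductPrecond/norm2Precond:
+ // copy the owner mask where this level is not covered by a finer level
+ // (fine mask == 0), zero it where it is covered, and (for the Sigma
+ // coarsening strategy) halve it on Neumann/inflow domain faces; see the
+ // mlndlap_set_dot_mask kernels added below.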
AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + { + mlndlap_set_dot_mask(tbx, dfab, sfab, ffab, nddomain, lobc, hibc); + }); + } + } else { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(m_precond_weight_mask[ilev],TilingIfNotGPU()); + mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + Array4 const& dfab = m_precond_weight_mask[ilev].array(mfi); + Array4 const& sfab = omask->const_array(mfi); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( bx, tbx, + { + mlndlap_set_dot_mask(tbx, dfab, sfab, nddomain, lobc, hibc); + }); + } + } + } } } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H index b842dd81b88..c6222dd4a8a 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_1D_K.H @@ -162,6 +162,41 @@ void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, + Array4 const& fmsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = bx.smallEnd(0); + const auto hi = bx.bigEnd(0); + + AMREX_PRAGMA_SIMD + for (int i = lo; i <= hi; ++i) { + if (fmsk(i,0,0) == 0) { + dmsk(i,0,0) = static_cast(omsk(i,0,0)); + } else { + dmsk(i,0,0) = Real(0); + } + } + + const auto domlo = dom.smallEnd(0); + const auto domhi = dom.bigEnd(0); + + if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) + && lo == domlo) + { + dmsk(lo,0,0) *= Real(0.5); + } + + if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) + && hi == domhi) + { + dmsk(hi,0,0) *= Real(0.5); + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H index 3d8746cf058..2646f48057c 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_2D_K.H @@ -312,6 +312,63 @@ void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, + Array4 const& fmsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (fmsk(i,j,0) == 0) { + dmsk(i,j,0) = static_cast(omsk(i,j,0)); + } else { + dmsk(i,j,0) = Real(0); + } + }} + + const auto domlo = amrex::lbound(dom); + const auto domhi = amrex::ubound(dom); + + if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) + && lo.x == domlo.x) + { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(lo.x,j,0) *= Real(0.5); + } + } + + if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) + && hi.x == domhi.x) + { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(hi.x,j,0) *= Real(0.5); + } + } + + if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow) + && lo.y == domlo.y) + { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,lo.y,0) *= Real(0.5); + } + } + + if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow) + && hi.y == domhi.y) + { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,hi.y,0) *= Real(0.5); + } + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H 
b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H index 976a16c7aa5..bf9c14cb491 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp_3D_K.H @@ -805,6 +805,88 @@ void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlndlap_set_dot_mask (Box const& bx, Array4 const& dmsk, + Array4 const& omsk, + Array4 const& fmsk, Box const& dom, + GpuArray const& bclo, + GpuArray const& bchi) noexcept +{ + const auto lo = amrex::lbound(bx); + const auto hi = amrex::ubound(bx); + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (fmsk(i,j,k) == 0) { + dmsk(i,j,k) = static_cast(omsk(i,j,k)); + } else { + dmsk(i,j,k) = Real(0); + } + }}} + + const auto domlo = amrex::lbound(dom); + const auto domhi = amrex::ubound(dom); + + if ((bclo[0] == LinOpBCType::Neumann || bclo[0] == LinOpBCType::inflow) + && lo.x == domlo.x) + { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(lo.x,j,k) *= Real(0.5); + }} + } + + if ((bchi[0] == LinOpBCType::Neumann || bchi[0] == LinOpBCType::inflow) + && hi.x == domhi.x) + { + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + dmsk(hi.x,j,k) *= Real(0.5); + }} + } + + if ((bclo[1] == LinOpBCType::Neumann || bclo[1] == LinOpBCType::inflow) + && lo.y == domlo.y) + { + for (int k = lo.z; k <= hi.z; ++k) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,lo.y,k) *= Real(0.5); + }} + } + + if ((bchi[1] == LinOpBCType::Neumann || bchi[1] == LinOpBCType::inflow) + && hi.y == domhi.y) + { + for (int k = lo.z; k <= hi.z; ++k) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,hi.y,k) *= Real(0.5); + }} + } + + if ((bclo[2] == LinOpBCType::Neumann || bclo[2] == LinOpBCType::inflow) + && lo.z == domlo.z) + { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,lo.z) *= Real(0.5); + }} + } + + if ((bchi[2] == LinOpBCType::Neumann || bchi[2] == LinOpBCType::inflow) + && hi.z == domhi.z) + { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + dmsk(i,j,hi.z) *= Real(0.5); + }} + } +} + } #endif diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp index 7a7647ce93a..27f6d439d90 100644 --- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp +++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp @@ -477,12 +477,9 @@ MyTest::solveABecLaplacianGMRES () const auto nlevels = static_cast(geom.size()); - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(composite_solve == false || nlevels == 1, - "solveABecLaplacianGMRES does not support composite solve"); - - for (int ilev = 0; ilev < nlevels; ++ilev) + if (composite_solve) { - MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info); + MLABecLaplacian mlabec(geom, grids, dmap, info); mlabec.setMaxOrder(linop_maxorder); @@ -493,40 +490,99 @@ MyTest::solveABecLaplacianGMRES () LinOpBCType::Dirichlet, LinOpBCType::Neumann)}); - if (ilev > 0) { - mlabec.setCoarseFineBC(&solution[ilev-1], ref_ratio); + for (int ilev = 0; ilev < nlevels; ++ilev) + { + mlabec.setLevelBC(ilev, &solution[ilev]); } - // for problem with pure homogeneous Neumann BC, we could pass a nullptr - mlabec.setLevelBC(0, &solution[ilev]); - mlabec.setScalars(ascalar, bscalar); - mlabec.setACoeffs(0, acoef[ilev]); - - Array face_bcoef; - 
diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
index 7a7647ce93a..27f6d439d90 100644
--- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
+++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
@@ -477,12 +477,9 @@ MyTest::solveABecLaplacianGMRES ()
 
     const auto nlevels = static_cast<int>(geom.size());
 
-    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(composite_solve == false || nlevels == 1,
-                                     "solveABecLaplacianGMRES does not support composite solve");
-
-    for (int ilev = 0; ilev < nlevels; ++ilev)
+    if (composite_solve)
     {
-        MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info);
+        MLABecLaplacian mlabec(geom, grids, dmap, info);
 
         mlabec.setMaxOrder(linop_maxorder);
 
@@ -493,40 +490,99 @@ MyTest::solveABecLaplacianGMRES ()
                                          LinOpBCType::Dirichlet,
                                          LinOpBCType::Neumann)});
 
-        if (ilev > 0) {
-            mlabec.setCoarseFineBC(&solution[ilev-1], ref_ratio);
+        for (int ilev = 0; ilev < nlevels; ++ilev)
+        {
+            mlabec.setLevelBC(ilev, &solution[ilev]);
         }
 
-        // for problem with pure homogeneous Neumann BC, we could pass a nullptr
-        mlabec.setLevelBC(0, &solution[ilev]);
-
         mlabec.setScalars(ascalar, bscalar);
 
-        mlabec.setACoeffs(0, acoef[ilev]);
-
-        Array<MultiFab,AMREX_SPACEDIM> face_bcoef;
-        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+        for (int ilev = 0; ilev < nlevels; ++ilev)
         {
-            const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(),
-                                                IntVect::TheDimensionVector(idim));
-            face_bcoef[idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0);
+            mlabec.setACoeffs(ilev, acoef[ilev]);
+
+            Array<MultiFab,AMREX_SPACEDIM> face_bcoef;
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+            {
+                const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(),
+                                                    IntVect::TheDimensionVector(idim));
+                face_bcoef[idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0);
+            }
+            amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef),
+                                              bcoef[ilev], geom[ilev]);
+            mlabec.setBCoeffs(ilev, amrex::GetArrOfConstPtrs(face_bcoef));
         }
-        amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef),
-                                          bcoef[ilev], geom[ilev]);
-        mlabec.setBCoeffs(0, amrex::GetArrOfConstPtrs(face_bcoef));
 
         MLMG mlmg(mlabec);
-        GMRESMLMG gmsolver(mlmg);
+        GMRESMLMGT gmsolver(mlmg);
         gmsolver.usePrecond(true);
         gmsolver.setVerbose(verbose);
-        gmsolver.solve(solution[ilev], rhs[ilev], tol_rel, tol_abs);
+        gmsolver.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs), tol_rel, tol_abs);
 
         if (verbose) {
-            MultiFab res(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
-            mlmg.apply({&res}, {&solution[ilev]}); // res = L(sol)
-            MultiFab::Subtract(res, rhs[ilev], 0, 0, 1, 0); // now res = L(sol) - rhs
-            amrex::Print() << "Final residual = " << res.norminf(0)
-                           << " " << res.norm1(0) << " " << res.norm2(0) << '\n';
+            Vector<MultiFab> res(nlevels);
+            for (int ilev = 0; ilev < nlevels; ++ilev) {
+                res[ilev].define(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
+            }
+            mlmg.apply(GetVecOfPtrs(res), GetVecOfPtrs(solution)); // res = L(sol)
+            for (int ilev = 0; ilev < nlevels; ++ilev) {
+                MultiFab::Subtract(res[ilev], rhs[ilev], 0, 0, 1, 0);
+                amrex::Print() << "Final residual at level " << ilev << " = "
+                               << res[ilev].norminf(0) << " " << res[ilev].norm1(0) << " "
+                               << res[ilev].norm2(0) << '\n';
+            }
+        }
+    }
+    else
+    {
+        for (int ilev = 0; ilev < nlevels; ++ilev)
+        {
+            MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info);
+
+            mlabec.setMaxOrder(linop_maxorder);
+
+            mlabec.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet,
+                                             LinOpBCType::Neumann,
+                                             LinOpBCType::Neumann)},
+                               {AMREX_D_DECL(LinOpBCType::Neumann,
+                                             LinOpBCType::Dirichlet,
+                                             LinOpBCType::Neumann)});
+
+            if (ilev > 0) {
+                mlabec.setCoarseFineBC(&solution[ilev-1], ref_ratio);
+            }
+
+            // for problem with pure homogeneous Neumann BC, we could pass a nullptr
+            mlabec.setLevelBC(0, &solution[ilev]);
+
+            mlabec.setScalars(ascalar, bscalar);
+
+            mlabec.setACoeffs(0, acoef[ilev]);
+
+            Array<MultiFab,AMREX_SPACEDIM> face_bcoef;
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+            {
+                const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(),
+                                                    IntVect::TheDimensionVector(idim));
+                face_bcoef[idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0);
+            }
+            amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef),
+                                              bcoef[ilev], geom[ilev]);
+            mlabec.setBCoeffs(0, amrex::GetArrOfConstPtrs(face_bcoef));
+
+            MLMG mlmg(mlabec);
+            GMRESMLMGT gmsolver(mlmg);
+            gmsolver.usePrecond(true);
+            gmsolver.setVerbose(verbose);
+            gmsolver.solve(solution[ilev], rhs[ilev], tol_rel, tol_abs);
+
+            if (verbose) {
+                MultiFab res(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
+                mlmg.apply({&res}, {&solution[ilev]}); // res = L(sol)
+                MultiFab::Subtract(res, rhs[ilev], 0, 0, 1, 0); // now res = L(sol) - rhs
+                amrex::Print() << "Final residual = " << res.norminf(0)
+                               << " " << res.norm1(0) << " " << res.norm2(0) << '\n';
+            }
         }
     }
 }
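Note (illustration, not part of the patch): the composite branch above reduces to building one MLABecLaplacian over all AMR levels and handing the per-level solution and rhs vectors to GMRES in a single call. A hedged sketch of that calling pattern, assuming the operator and data are set up exactly as in the test (the helper name and signature are hypothetical):

    #include <AMReX_MLABecLaplacian.H>
    #include <AMReX_MLMG.H>
    #include <AMReX_GMRES_MLMG.H>

    void solveCompositeGMRES (amrex::MLABecLaplacian& mlabec,
                              amrex::Vector<amrex::MultiFab>& solution,
                              amrex::Vector<amrex::MultiFab> const& rhs,
                              amrex::Real tol_rel, amrex::Real tol_abs, int verbose)
    {
        amrex::MLMG mlmg(mlabec);          // multigrid, used here as the preconditioner
        amrex::GMRESMLMGT gmsolver(mlmg);  // GMRES wrapper around MLMG
        gmsolver.usePrecond(true);
        gmsolver.setVerbose(verbose);
        // One solve call spans all AMR levels (multi-level composite solve).
        gmsolver.solve(amrex::GetVecOfPtrs(solution),
                       amrex::GetVecOfConstPtrs(rhs), tol_rel, tol_abs);
    }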
diff --git a/Tests/LinearSolvers/CellEB/GNUmakefile b/Tests/LinearSolvers/CellEB/GNUmakefile
index 45c97ff1584..83dd8799e0b 100644
--- a/Tests/LinearSolvers/CellEB/GNUmakefile
+++ b/Tests/LinearSolvers/CellEB/GNUmakefile
@@ -22,7 +22,7 @@ include ./Make.package
 
 Pdirs := Base Boundary AmrCore
 Pdirs += EB
-Pdirs += LinearSolvers/MLMG
+Pdirs += LinearSolvers
 
 Ppack += $(foreach dir, $(Pdirs), $(AMREX_HOME)/Src/$(dir)/Make.package)
 
diff --git a/Tests/LinearSolvers/CellEB/MyTest.H b/Tests/LinearSolvers/CellEB/MyTest.H
index 4af902ddec3..de06b39db79 100644
--- a/Tests/LinearSolvers/CellEB/MyTest.H
+++ b/Tests/LinearSolvers/CellEB/MyTest.H
@@ -44,6 +44,9 @@ private:
     int max_coarsening_level = 30;
     bool use_hypre = false;
     bool use_petsc = false;
+
+    bool use_gmres = false;
+
     amrex::Vector<amrex::Geometry> geom;
     amrex::Vector<amrex::BoxArray> grids;
     amrex::Vector<amrex::DistributionMapping> dmap;
diff --git a/Tests/LinearSolvers/CellEB/MyTest.cpp b/Tests/LinearSolvers/CellEB/MyTest.cpp
index 8f8c000997e..b895c09c986 100644
--- a/Tests/LinearSolvers/CellEB/MyTest.cpp
+++ b/Tests/LinearSolvers/CellEB/MyTest.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include <AMReX_GMRES_MLMG.H>
 
 #include
 
@@ -85,7 +86,43 @@ MyTest::solve ()
     }
     const Real tol_rel = reltol;
     const Real tol_abs = 0.0;
-    mlmg.solve(amrex::GetVecOfPtrs(phi), amrex::GetVecOfConstPtrs(rhs), tol_rel, tol_abs);
+
+    if (verbose) {
+        Vector<MultiFab> res(max_level+1);
+        for (int ilev = 0; ilev <= max_level; ++ilev) {
+            res[ilev].define(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
+        }
+        mlmg.apply(GetVecOfPtrs(res), GetVecOfPtrs(phi)); // res = L(sol)
+        for (int ilev = 0; ilev <= max_level; ++ilev) {
+            MultiFab::Subtract(res[ilev], rhs[ilev], 0, 0, 1, 0);
+            amrex::Print() << "Initial max, 1 and 2-norm residuals at level " << ilev << " = "
+                           << res[ilev].norminf(0) << " " << res[ilev].norm1(0) << " "
+                           << res[ilev].norm2(0) << '\n';
+        }
+    }
+
+    if (use_gmres) {
+        GMRESMLMGT gmsolver(mlmg);
+        gmsolver.usePrecond(true);
+        gmsolver.setVerbose(verbose);
+        gmsolver.solve(GetVecOfPtrs(phi), GetVecOfConstPtrs(rhs), tol_rel, tol_abs);
+    } else {
+        mlmg.solve(amrex::GetVecOfPtrs(phi), amrex::GetVecOfConstPtrs(rhs), tol_rel, tol_abs);
+    }
+
+    if (verbose) {
+        Vector<MultiFab> res(max_level+1);
+        for (int ilev = 0; ilev <= max_level; ++ilev) {
+            res[ilev].define(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
+        }
+        mlmg.apply(GetVecOfPtrs(res), GetVecOfPtrs(phi)); // res = L(sol)
+        for (int ilev = 0; ilev <= max_level; ++ilev) {
+            MultiFab::Subtract(res[ilev], rhs[ilev], 0, 0, 1, 0);
+            amrex::Print() << "Final max, 1 and 2-norm residuals at level " << ilev << " = "
+                           << res[ilev].norminf(0) << " " << res[ilev].norm1(0) << " "
+                           << res[ilev].norm2(0) << '\n';
+        }
+    }
 }
 
 void
@@ -143,6 +180,8 @@ MyTest::readParameters ()
 #ifdef AMREX_USE_PETSC
     pp.query("use_petsc",use_petsc);
 #endif
+
+    pp.query("use_gmres", use_gmres);
 }
 
 void
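Note (illustration, not part of the patch): with the use_gmres flag added above, the CellEB test can exercise the GMRES path on a multi-level hierarchy directly from its inputs file; a minimal hypothetical snippet (all other parameters unchanged):

    use_gmres = 1     # route the composite solve through GMRESMLMGT instead of plain MLMG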
diff --git a/Tests/LinearSolvers/CurlCurl/MyTest.cpp b/Tests/LinearSolvers/CurlCurl/MyTest.cpp
index f8a84ca5051..ea7e23bd68e 100644
--- a/Tests/LinearSolvers/CurlCurl/MyTest.cpp
+++ b/Tests/LinearSolvers/CurlCurl/MyTest.cpp
@@ -65,11 +65,6 @@ MyTest::solve ()
     GMRESMLMGT gmsolver(mlmg);
     gmsolver.usePrecond(gmres_use_precond);
     gmsolver.setPrecondNumIters(gmres_precond_niters);
-
-    // This system has homogeneous BC unlike
-    // Tests/LinearSolvers/ABecLaplacian_C, so the setup is simpler.
-    gmsolver.setPropertyOfZero(true);
-
     gmsolver.setVerbose(verbose);
     gmsolver.solve(solution, rhs, tol_rel, tol_abs);
 }
diff --git a/Tests/LinearSolvers/NodalPoisson/MyTest.cpp b/Tests/LinearSolvers/NodalPoisson/MyTest.cpp
index 39c781d8795..af0248e7e05 100644
--- a/Tests/LinearSolvers/NodalPoisson/MyTest.cpp
+++ b/Tests/LinearSolvers/NodalPoisson/MyTest.cpp
@@ -59,7 +59,13 @@ MyTest::solve ()
             solution[ilev].setVal(0.0, interior, 0, 1, 0);
         }
 
-        mlmg.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs), reltol, 0.0);
+        if (use_gmres) {
+            GMRESMLMG gmsolver(mlmg);
+            gmsolver.setVerbose(verbose);
+            gmsolver.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs), reltol, 0.0);
+        } else {
+            mlmg.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs), reltol, 0.0);
+        }
     }
     else // solve level by level
     {
@@ -154,9 +160,6 @@ MyTest::readParameters ()
 
     pp.query("composite_solve", composite_solve);
     pp.query("use_gmres", use_gmres);
-    if (use_gmres) {
-        composite_solve = false;
-    }
 
     pp.query("verbose", verbose);
     pp.query("bottom_verbose", bottom_verbose);
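Note (illustration, not part of the patch): because readParameters() above no longer forces composite_solve off when GMRES is requested, the NodalPoisson test can now combine the two options in its inputs file, e.g. (parameter names as queried above, values hypothetical):

    composite_solve = 1
    use_gmres = 1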