GMRES is now working with FFT preconditioner
asalmgren committed Nov 24, 2024
1 parent ceb0eef commit cf9593a
Showing 9 changed files with 367 additions and 27 deletions.
16 changes: 14 additions & 2 deletions Docs/sphinx_doc/LinearSolvers.rst
@@ -7,6 +7,18 @@
Linear Solvers
==============

Evolving the anelastic equation set requires solution of a Poisson equation in which we solve for the update to the perturbational pressure at cell centers.
ERF uses several solver options available through AMReX: geometric multigrid, Fast Fourier Transforms (FFTs) and preconditioned GMRES.
For simulations with no terrain or grid stretching, one of the FFT options is generally the fastest solver,
followed by multigrid. We note that the multigrid solver has the option to ``ignore`` a coordinate direction
if the domain is only one cell wide in that direction; this allows for efficient solution of effectively 2D problems.
Multigrid can also be used when the union of grids at a level is not in itself rectangular; the FFT solvers do not work in that general case.
For simulations using grid stretching in the vertical but flat terrain, we must use the hybrid FFT solver in which
we perform 2D transforms only in the lateral directions and couple the solution in the vertical direction with a tridiagonal solve.
In both these cases we use a 7-point stencil.
To solve the Poisson equation on terrain-fitted coordinates with general terrain,
we rely on the FFT-preconditioned GMRES solver since the stencil effectively has variable coefficients and requires 19 points.

.. note::

   **Currently only doubly periodic lateral boundary conditions are supported by the hybrid FFT, and therefore by the GMRES solver. More general boundary conditions are a work in progress.**
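To make the tridiagonal step concrete, below is a minimal sketch (illustrative only, not ERF code) of the Thomas algorithm applied to one vertical column of transformed data; the name ``thomas_solve`` and the vectors ``a``, ``b``, ``c`` (sub-, main- and super-diagonals) and ``r`` (right-hand side) are placeholders:

.. code-block:: c++

   #include <vector>

   // Solve a tridiagonal system in place: a = sub-diagonal, b = main diagonal,
   // c = super-diagonal, r = right-hand side (overwritten with the solution).
   void thomas_solve (std::vector<double> const& a,
                      std::vector<double> const& b,
                      std::vector<double> const& c,
                      std::vector<double>& r)
   {
       int n = static_cast<int>(r.size());
       std::vector<double> w(n);            // modified super-diagonal (scratch)
       w[0] = c[0] / b[0];
       r[0] = r[0] / b[0];
       for (int k = 1; k < n; ++k) {        // forward elimination
           double m = b[k] - a[k] * w[k-1];
           if (k < n-1) { w[k] = c[k] / m; }
           r[k] = (r[k] - a[k] * r[k-1]) / m;
       }
       for (int k = n-2; k >= 0; --k) {     // back substitution
           r[k] -= w[k] * r[k+1];
       }
   }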
262 changes: 262 additions & 0 deletions Source/LinearSolvers/ERF_FFT_TerrainPrecond.H
@@ -0,0 +1,262 @@
#ifndef ERF_FFT_TERRAIN_PRECOND_H_
#define ERF_FFT_TERRAIN_PRECOND_H_

#include <AMReX_FFT.H>
#include <AMReX_FFT_Poisson.H>
#include <AMReX_Geometry.H>

namespace amrex::FFT
{

/**
* \brief 3D Preconditioner for terrain problems with periodic boundaries in the first two
* dimensions and Neumann in the last dimension.
*/
template <typename MF = MultiFab>
class PoissonTerrainPrecond
{
public:
using T = typename MF::value_type;

template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
explicit PoissonTerrainPrecond (Geometry const& geom)
: m_geom(geom), m_r2c(geom.Domain(), Info().setBatchMode(true))
{
AMREX_ALWAYS_ASSERT(geom.isPeriodic(0) && geom.isPeriodic(1));
}

void solve (MF& soln, MF const& rhs, MF const& height);

template <typename DZ>
void solve_doit (MF& soln, MF const& rhs, MF const& height, DZ const& dz); // has to be public for cuda

private:
Geometry m_geom;
R2C<typename MF::value_type, Direction::both> m_r2c;
};

template <typename MF>
void PoissonTerrainPrecond<MF>::solve (MF& soln, MF const& rhs, MF const& height)
{
auto delz = T(m_geom.CellSize(AMREX_SPACEDIM-1));
solve_doit(soln, rhs, height, fft_poisson_detail::DZ<T>{delz});
}

template <typename MF>
template <typename DZ>
void PoissonTerrainPrecond<MF>::solve_doit (MF& soln, MF const& rhs, MF const& height, DZ const& dz)
{
BL_PROFILE("FFT::PoissonTerrainPrecond::solve");

#if (AMREX_SPACEDIM < 3)
amrex::ignore_unused(soln, rhs, height, dz);
#else
auto facx = T(2)*Math::pi<T>()/T(m_geom.ProbLength(0));
auto facy = T(2)*Math::pi<T>()/T(m_geom.ProbLength(1));

auto dx = T(m_geom.CellSize(0));
auto dy = T(m_geom.CellSize(1));

auto dxinv = T(m_geom.InvCellSize(0));
auto dyinv = T(m_geom.InvCellSize(1));

auto scale = T(1.0)/(T(m_geom.Domain().length(0)) *
T(m_geom.Domain().length(1)));
auto ny = m_geom.Domain().length(1);
auto nz = m_geom.Domain().length(2);

Box cdomain = m_geom.Domain();
cdomain.setBig(0,cdomain.length(0)/2);
auto cba = amrex::decompose(cdomain, ParallelContext::NProcsSub(),
{AMREX_D_DECL(true,true,false)});
DistributionMapping dm = detail::make_iota_distromap(cba.size());
FabArray<BaseFab<GpuComplex<T> > > spmf(cba, dm, 1, 0);
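// spmf holds the forward transform of rhs: only nx/2+1 complex values in x (hence the
// reduced cdomain above), and z is left untransformed by the batch-mode R2C, so each
// (i,j) entry below is a full vertical column that can be solved directly.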

m_r2c.forward(rhs, spmf);

for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
{
auto const& spectral = spmf.array(mfi);
auto const& box = mfi.validbox();
auto const& xybox = amrex::makeSlab(box, 2, 0);

auto const zp = height.const_array(mfi);

#ifdef AMREX_USE_GPU
// xxxxx TODO: We need to explore how to optimize this
// function. Maybe we can use cuSPARSE. Maybe we should make
// the z-direction the unit-stride direction.

FArrayBox tridiag_workspace(box,4);
auto const& ald = tridiag_workspace.array(0);
auto const& bd = tridiag_workspace.array(1);
auto const& cud = tridiag_workspace.array(2);
auto const& scratch = tridiag_workspace.array(3);

amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
{
T a = facx*i;
T b = (j < ny/2) ? facy*j : facy*(ny-j);

T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+ T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
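// k2 above is the Fourier symbol (eigenvalue) of the second-order centered horizontal
// Laplacian at lateral wavenumbers a and b; only the vertical coupling remains, so each
// spectral column (i,j) reduces to the tridiagonal system assembled below.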

// Tridiagonal solve with homogeneous Neumann
for(int k=0; k < nz; k++) {
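// zp holds node-centered physical heights: averaging its four horizontal corners gives the
// face height, so the hzeta_inv_* factors are inverse vertical spacings (cell thickness at
// the cell center; center-to-center spacing across a z-face) and h_xi/h_eta are the lateral
// slopes of the corresponding z-face.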
Real hzeta_inv_on_cc = 4.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
-(zp(i,j,k ) + zp(i+1,j,k ) + zp(i,j+1,k ) + zp(i+1,j+1,k )) );
if(k==0) {

Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
-(zp(i,j,k ) + zp(i+1,j,k ) + zp(i,j+1,k ) + zp(i+1,j+1,k )) );
Real h_xi_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;

ald(i,j,k) = 0.;
cud(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);

} else if (k == nz-1) {

Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
-(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
Real h_xi_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i+1,j,k ) - zp(i,j+1,k ) - zp(i,j,k )) * dxinv;
Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i,j+1,k ) - zp(i+1,j,k ) - zp(i,j,k )) * dyinv;
ald(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
cud(i,j,k) = 0.;
bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);
if (i == 0 && j == 0) {
bd(i,j,k) *= 2.0;
}
} else {
Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
-(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
Real h_xi_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i+1,j,k ) - zp(i,j+1,k ) - zp(i,j,k )) * dxinv;
Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i,j+1,k ) - zp(i+1,j,k ) - zp(i,j,k )) * dyinv;

Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
-(zp(i,j,k ) + zp(i+1,j,k ) + zp(i,j+1,k ) + zp(i+1,j+1,k )) );
Real h_xi_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;

ald(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
cud(i,j,k) = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
bd(i,j,k) = k2 - ald(i,j,k) - cud(i,j,k);

}
}
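// Thomas algorithm on the assembled system: forward elimination, then back substitution;
// the final loop applies the 1/(nx*ny) normalization of the forward transform.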

scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);

for (int k = 1; k < nz; k++) {
if (k < nz-1) {
scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
}
spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
/ (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
}

for (int k = nz - 2; k >= 0; k--) {
spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
}

for (int k = 0; k < nz; ++k) {
spectral(i,j,k) *= scale;
}
});
Gpu::streamSynchronize();

#else
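// Host path: the same per-column spectral tridiagonal solve as the GPU branch above,
// with the coefficients kept in per-column work vectors instead of a workspace FArrayBox.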

Gpu::DeviceVector<GpuComplex<Real>> ald(nz);
Gpu::DeviceVector<GpuComplex<Real>> bd(nz);
Gpu::DeviceVector<GpuComplex<Real>> cud(nz);
Gpu::DeviceVector<GpuComplex<Real>> scratch(nz);

amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
{
T a = facx*i;
T b = (j < ny/2) ? facy*j : facy*(ny-j);

T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+ T(2)*(std::cos(b*dy)-T(1))/(dy*dy);

// Tridiagonal solve with homogeneous Neumann
for(int k=0; k < nz; k++) {

Real hzeta_inv_on_cc = 4.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
-(zp(i,j,k ) + zp(i+1,j,k ) + zp(i,j+1,k ) + zp(i+1,j+1,k )) );

if(k==0) {

Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
-(zp(i,j,k ) + zp(i+1,j,k ) + zp(i,j+1,k ) + zp(i+1,j+1,k )) );
Real h_xi_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;

ald[k] = 0.;
cud[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
bd[k] = k2 - ald[k] - cud[k];

} else if (k == nz-1) {

Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
-(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
Real h_xi_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i+1,j,k ) - zp(i,j+1,k ) - zp(i,j,k )) * dxinv;
Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i,j+1,k ) - zp(i+1,j,k ) - zp(i,j,k )) * dyinv;

ald[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
cud[k] = 0.;
bd[k] = k2 - ald[k] - cud[k];

if (i == 0 && j == 0) {
bd[k] *= 2.0;
}
} else {

Real hzeta_inv_on_zlo = 8.0 / ( (zp(i,j,k+1) + zp(i+1,j,k+1) + zp(i,j+1,k+1) + zp(i+1,j+1,k+1))
-(zp(i,j,k-1) + zp(i+1,j,k-1) + zp(i,j+1,k-1) + zp(i+1,j+1,k-1)) );
Real h_xi_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i+1,j,k ) - zp(i,j+1,k ) - zp(i,j,k )) * dxinv;
Real h_eta_on_zlo = 0.5 * (zp(i+1,j+1,k ) + zp(i,j+1,k ) - zp(i+1,j,k ) - zp(i,j,k )) * dyinv;

Real hzeta_inv_on_zhi = 8.0 / ( (zp(i,j,k+2) + zp(i+1,j,k+2) + zp(i,j+1,k+2) + zp(i+1,j+1,k+2))
-(zp(i,j,k ) + zp(i+1,j,k ) + zp(i,j+1,k ) + zp(i+1,j+1,k )) );
Real h_xi_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i+1,j,k+1) - zp(i,j+1,k+1) - zp(i,j,k+1)) * dxinv;
Real h_eta_on_zhi = 0.5 * (zp(i+1,j+1,k+1) + zp(i,j+1,k+1) - zp(i+1,j,k+1) - zp(i,j,k+1)) * dyinv;

ald[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zlo*h_xi_on_zlo + h_eta_on_zlo*h_eta_on_zlo) * hzeta_inv_on_zlo;
cud[k] = hzeta_inv_on_cc * (1.0 + h_xi_on_zhi*h_xi_on_zhi + h_eta_on_zhi*h_eta_on_zhi) * hzeta_inv_on_zhi;
bd[k] = k2 - ald[k] - cud[k];
}
}

scratch[0] = cud[0]/bd[0];
spectral(i,j,0) = spectral(i,j,0)/bd[0];

for (int k = 1; k < nz; k++) {
if (k < nz-1) {
scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
}
spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
/ (bd[k] - ald[k] * scratch[k-1]);
}

for (int k = nz - 2; k >= 0; k--) {
spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
}

for (int k = 0; k < nz; ++k) {
spectral(i,j,k) *= scale;
}
});
#endif
}

m_r2c.backward(spmf, soln);
#endif
}

}

#endif
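For orientation, here is a minimal usage sketch of the preconditioner defined above. This driver is not part of the commit; the function name and include path are placeholders, and the height array is assumed to hold node-centered physical heights, consistent with the stencil in solve_doit.

#include <AMReX.H>
#include <AMReX_Geometry.H>
#include <AMReX_MultiFab.H>
#include "ERF_FFT_TerrainPrecond.H" // header added in this commit; include path assumed

// Apply the FFT-based terrain preconditioner once: a forward 2D FFT in x and y,
// a tridiagonal solve in z for each lateral mode, and a backward 2D FFT.
void apply_terrain_precond (amrex::Geometry const& geom,   // must be periodic in x and y
                            amrex::MultiFab const& rhs,    // cell-centered right-hand side
                            amrex::MultiFab const& height, // node-centered physical heights
                            amrex::MultiFab& soln)         // cell-centered result
{
    amrex::FFT::PoissonTerrainPrecond<amrex::MultiFab> precond(geom);
    precond.solve(soln, rhs, height);
}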
75 changes: 62 additions & 13 deletions Source/LinearSolvers/ERF_PoissonSolve.cpp
@@ -17,7 +17,6 @@ void ERF::project_velocities (int lev, Real l_dt, Vector<MultiFab>& mom_mf, Mult
#endif

bool l_use_terrain = SolverChoice::terrain_type != TerrainType::None;

// Make sure the solver only sees the levels over which we are solving
Vector<BoxArray> ba_tmp; ba_tmp.push_back(mom_mf[Vars::cons].boxArray());
@@ -111,27 +110,77 @@
// ****************************************************************************
// Choose the solver and solve
// ****************************************************************************

// ****************************************************************************
// EB
// ****************************************************************************
#ifdef ERF_USE_EB
solve_with_EB_mlmg(lev, rhs, phi, fluxes);
#else

bool boxes_make_rectangle = (geom_tmp[0].Domain().numPts() == ba_tmp[0].numPts());
// ****************************************************************************
// No terrain or grid stretching
// ****************************************************************************
if (!l_use_terrain) {
#ifdef ERF_USE_FFT
if (use_fft) {
if (boxes_make_rectangle) {
solve_with_fft(lev, rhs[0], phi[0], fluxes[0]);
} else {
amrex::Warning("FFT won't work unless the boxArray covers the domain: defaulting to MLMG");
solve_with_mlmg(lev, rhs, phi, fluxes);
}
} else {
solve_with_mlmg(lev, rhs, phi, fluxes);
}
#else
if (use_fft) {
amrex::Warning("use_fft can't be used unless you build with USE_FFT = TRUE; defaulting to MLMG");
}
solve_with_mlmg(lev, rhs, phi, fluxes);
#endif
} // No terrain or grid stretching

// ****************************************************************************
// Grid stretching (flat terrain)
// ****************************************************************************
else if (l_use_terrain && SolverChoice::terrain_is_flat) {
#ifndef ERF_USE_FFT
amrex::Abort("Rebuild with USE_FFT = TRUE so you can use the FFT solver");
#else
if (!boxes_make_rectangle) {
amrex::Abort("FFT won't work unless the boxArray covers the domain");
} else {
if (!use_fft) {
amrex::Warning("Using FFT even though you didnt set use_fft = 0; it's the best choice");
}
solve_with_fft(lev, rhs[0], phi[0], fluxes[0]);
}
#endif
} // grid stretching

// ****************************************************************************
// General terrain
// ****************************************************************************
else if (l_use_terrain && !SolverChoice::terrain_is_flat) {
#ifdef ERF_USE_FFT
if (use_fft)
{
amrex::Warning("FFT solver does not work for general terrain: switching to GMRES");
}
if (!boxes_make_rectangle) {
amrex::Abort("FFT preconditioner for GMRES won't work unless the boxArray covers the domain");
} else {
solve_with_gmres(lev, rhs, phi, fluxes[0]);
}
#else
amrex::Abort("Rebuild with USE_FFT = TRUE so you can use the FFT preconditioner for GMRES");
#endif
} // general terrain

#endif // not EB
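// (Not shown in this hunk: solve_with_gmres is expected to wrap the 19-point
// terrain-following operator and apply FFT::PoissonTerrainPrecond as the
// preconditioner, per the Docs changes above.)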

// ****************************************************************************
// Print time in solve