diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst
index 8d6205f43a5..2a5957e40bc 100644
--- a/Docs/sphinx_documentation/source/FFT.rst
+++ b/Docs/sphinx_documentation/source/FFT.rst
@@ -7,10 +7,10 @@ FFT::R2C Class
 ==============
 
 Class template `FFT::R2C` supports discrete Fourier transforms between real
-and complex data. The name R2C indicates that the forward transform converts
-real data to complex data, while the backward transform converts complex
-data to real data. It should be noted that both directions of transformation
-are supported, not just from real to complex.
+and complex data across MPI processes. The name R2C indicates that the
+forward transform converts real data to complex data, while the backward
+transform converts complex data to real data. It should be noted that both
+directions of transformation are supported, not just from real to complex.
 
 The implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP,
 SYCL and CPU builds, respectively. Because the parallel communication is
@@ -18,8 +18,7 @@ handled by AMReX, it does not need the parallel version of
 FFTW. Furthermore, there is no constraint on the domain decomposition such
 as one Box per process. This class performs parallel FFT on AMReX's parallel
 data containers (e.g., :cpp:`MultiFab` and
-:cpp:`FabArray<BaseFab<ComplexData<Real>>>`. For local FFT, the users can
-use FFTW, cuFFT, rocFFT, or oneMKL directly.
+:cpp:`FabArray<BaseFab<ComplexData<Real>>>`.
 
 Other than using column-majored order, AMReX follows the convention of
 FFTW. Applying the forward transform followed by the backward transform
@@ -68,6 +67,32 @@ object. Therefore, one should cache it for reuse if possible. Although
 :cpp:`std::unique_ptr<FFT::R2C<Real>>` to store an object in one's class.
 
 
+.. _sec:FFT:localr2c:
+
+FFT::LocalR2C Class
+===================
+
+Class template `FFT::LocalR2C` supports local discrete Fourier transforms
+between real and complex data. The name R2C indicates that the forward
+transform converts real data to complex data, while the backward transform
+converts complex data to real data. It should be noted that both directions
+of transformation are supported, not just from real to complex.
+
+Below is an example of using :cpp:`FFT::LocalR2C`.
+
+.. highlight:: c++
+
+::
+
+    MultiFab mf(...);
+    BaseFab<GpuComplex<T>> cfab;
+    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
+        FFT::LocalR2C fft(mfi.fabbox().length());
+        cfab.resize(IntVect(0), fft.spectralSize()-1);
+        fft.forward(mf[mfi].dataPtr(), cfab.dataPtr());
+    }
+
+
 Poisson Solver
 ==============
 
diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp
index d3629c5fd31..9d9edeaeba5 100644
--- a/Src/Base/AMReX.cpp
+++ b/Src/Base/AMReX.cpp
@@ -14,6 +14,10 @@
 #include <AMReX_Geometry.H>
 #include <AMReX_Gpu.H>
 
+#ifdef AMREX_USE_FFT
+#include <AMReX_FFT.H>
+#endif
+
 #ifdef AMREX_USE_HYPRE
 #include <_hypre_utilities.h>
 #ifdef AMREX_USE_CUDA
@@ -655,6 +659,10 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse,
     AsyncOut::Initialize();
     VectorGrowthStrategy::Initialize();
 
+#ifdef AMREX_USE_FFT
+    FFT::Initialize();
+#endif
+
 #ifdef AMREX_USE_EB
     EB2::Initialize();
 #endif
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 155cdcd4ddd..d911349a61f 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -35,13 +35,6 @@
 #include <roctracer/roctx.h>
 #endif
 #endif
-#if defined(AMREX_USE_FFT)
-#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
-#    include <rocfft/rocfft.h>
-#  else
-#    include <rocfft.h>
-#  endif
-#endif
 #endif
 
 #ifdef AMREX_USE_ACC
@@ -317,10 +310,6 @@ Device::Initialize ()
     }
 #endif /* AMREX_USE_MPI */
 
-#if defined(AMREX_USE_HIP) && defined(AMREX_USE_FFT)
-    AMREX_ROCFFT_SAFE_CALL(rocfft_setup());
-#endif
-
     if (amrex::Verbose()) {
 #if defined(AMREX_USE_CUDA)
         amrex::Print() << "CUDA"
@@ -360,10 +349,6 @@ Device::Finalize ()
 #ifdef AMREX_USE_GPU
     Device::profilerStop();
 
-#if defined(AMREX_USE_HIP) && defined(AMREX_USE_FFT)
-    AMREX_ROCFFT_SAFE_CALL(rocfft_cleanup());
-#endif
-
 #ifdef AMREX_USE_SYCL
     for (auto& s : gpu_stream_pool) {
         delete s.queue;
diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H
index cd7c0984c59..11bf4f4cc81 100644
--- a/Src/FFT/AMReX_FFT.H
+++ b/Src/FFT/AMReX_FFT.H
@@ -2,8 +2,16 @@
 #define AMREX_FFT_H_
 #include <AMReX_Config.H>
 
+#include <AMReX_FFT_LocalR2C.H>
 #include <AMReX_FFT_OpenBCSolver.H>
 #include <AMReX_FFT_R2C.H>
 #include <AMReX_FFT_R2X.H>
 
+namespace amrex::FFT
+{
+    void Initialize ();
+    void Finalize ();
+    void Clear ();
+}
+
 #endif
diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp
index 8bf7b5fd9f3..91ac1a7a92f 100644
--- a/Src/FFT/AMReX_FFT.cpp
+++ b/Src/FFT/AMReX_FFT.cpp
@@ -1,5 +1,87 @@
+#include <AMReX_FFT.H>
 #include <AMReX_FFT_Helper.H>
 
+#include <map>
+
+namespace amrex::FFT
+{
+
+namespace
+{
+    bool s_initialized = false;
+    std::map<Key, PlanD> s_plans_d;
+    std::map<Key, PlanF> s_plans_f;
+}
+
+void Initialize ()
+{
+    if (!s_initialized)
+    {
+        s_initialized = true;
+
+#if defined(AMREX_USE_HIP) && defined(AMREX_USE_FFT)
+        AMREX_ROCFFT_SAFE_CALL(rocfft_setup());
+#endif
+    }
+
+    amrex::ExecOnFinalize(amrex::FFT::Finalize);
+}
+
+void Finalize ()
+{
+    if (s_initialized)
+    {
+        s_initialized = false;
+
+        Clear();
+
+#if defined(AMREX_USE_HIP) && defined(AMREX_USE_FFT)
+        AMREX_ROCFFT_SAFE_CALL(rocfft_cleanup());
+#endif
+    }
+}
+
+void Clear ()
+{
+    for (auto& [k, p] : s_plans_d) {
+        Plan<double>::destroy_vendor_plan(p);
+    }
+
+    for (auto& [k, p] : s_plans_f) {
+        Plan<float>::destroy_vendor_plan(p);
+    }
+}
+
+PlanD* get_vendor_plan_d (Key const& key)
+{
+    if (auto found = s_plans_d.find(key); found != s_plans_d.end()) {
+        return &(found->second);
+    } else {
+        return nullptr;
+    }
+}
+
+PlanF* get_vendor_plan_f (Key const& key)
+{
+    if (auto found = s_plans_f.find(key); found != s_plans_f.end()) {
+        return &(found->second);
+    } else {
+        return nullptr;
+    }
+}
+
+void add_vendor_plan_d (Key const& key, PlanD plan)
+{
+    s_plans_d[key] = plan;
+}
+
+void add_vendor_plan_f (Key const& key, PlanF plan)
+{
+    s_plans_f[key] = plan;
+}
+
+}
+
 namespace amrex::FFT::detail
 {
 
diff --git a/Src/FFT/AMReX_FFT_Helper.H b/Src/FFT/AMReX_FFT_Helper.H
index 315e0641ac9..1ef2d0180a3 100644
--- a/Src/FFT/AMReX_FFT_Helper.H
+++ b/Src/FFT/AMReX_FFT_Helper.H
@@ -36,6 +36,7 @@
 #include <complex>
 #include <limits>
 #include <memory>
+#include <tuple>
 #include <utility>
 #include <variant>
 
@@ -148,31 +149,22 @@ struct Plan
     std::size_t work_size = 0;
 #endif
 
+#ifdef AMREX_USE_GPU
+    void set_ptrs (void* p0, void* p1) {
+        pf = p0;
+        pb = p1;
+    }
+#endif
+
     void destroy ()
     {
         if (defined) {
-#if defined(AMREX_USE_CUDA)
-            AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan));
-#elif defined(AMREX_USE_HIP)
-            AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan));
-#elif defined(AMREX_USE_SYCL)
-            std::visit([](auto&& p) { delete p; }, plan);
-#else
-            if constexpr (std::is_same_v<float,T>) {
-                fftwf_destroy_plan(plan);
-            } else {
-                fftw_destroy_plan(plan);
-            }
-#endif
+            destroy_vendor_plan(plan);
             defined = false;
         }
 #if !defined(AMREX_USE_GPU)
         if (defined2) {
-            if constexpr (std::is_same_v<float,T>) {
-                fftwf_destroy_plan(plan2);
-            } else {
-                fftw_destroy_plan(plan2);
-            }
+            destroy_vendor_plan(plan2);
             defined2 = false;
         }
 #endif
@@ -211,19 +203,20 @@ struct Plan
         amrex::ignore_unused(nc);
 
 #if defined(AMREX_USE_CUDA)
+
         AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
         AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
         if constexpr (D == Direction::forward) {
             cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
             AMREX_CUFFT_SAFE_CALL
                 (cufftMakePlanMany(plan, rank, len, nullptr, 1, nr, nullptr, 1, nc, fwd_type, howmany, &work_size));
-            AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
         } else {
             cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
             AMREX_CUFFT_SAFE_CALL
                 (cufftMakePlanMany(plan, rank, len, nullptr, 1, nc, nullptr, 1, nr, bwd_type, howmany, &work_size));
-            AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
         }
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
+
 #elif defined(AMREX_USE_HIP)
 
         auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
@@ -300,6 +293,9 @@ struct Plan
 #endif
     }
 
+    template <Direction D, int M>
+    void init_r2c (IntVectND<M> const& fft_size, void* buffer, bool cache);
+
     template <Direction D>
     void init_c2c (Box const& box, VendorComplex* p)
     {
@@ -1097,10 +1093,191 @@ struct Plan
             if (defined2) { fftw_execute(plan2); }
         }
 
+#endif
+    }
+
+    static void destroy_vendor_plan (VendorPlan plan)
+    {
+#if defined(AMREX_USE_CUDA)
+        AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan));
+#elif defined(AMREX_USE_HIP)
+        AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan));
+#elif defined(AMREX_USE_SYCL)
+        std::visit([](auto&& p) { delete p; }, plan);
+#else
+        if constexpr (std::is_same_v<float,T>) {
+            fftwf_destroy_plan(plan);
+        } else {
+            fftw_destroy_plan(plan);
+        }
 #endif
     }
 };
 
+using Key = std::tuple<IntVectND<3>,Direction,Kind>;
+using PlanD = typename Plan<double>::VendorPlan;
+using PlanF = typename Plan<float>::VendorPlan;
+
+PlanD* get_vendor_plan_d (Key const& key);
+PlanF* get_vendor_plan_f (Key const& key);
+
+void add_vendor_plan_d (Key const& key, PlanD plan);
+void add_vendor_plan_f (Key const& key, PlanF plan);
+
+template <typename T>
+template <Direction D, int M>
+void Plan<T>::init_r2c (IntVectND<M> const& fft_size, void* buffer, bool cache)
+{
+    static_assert(D == Direction::forward || D == Direction::backward);
+
+    kind = (D == Direction::forward) ? Kind::r2c_f : Kind::r2c_b;
+    defined = true;
+    pf = buffer;
+    pb = buffer;
+
+    n = 1;
+    for (auto s : fft_size) { n *= s; }
+    howmany = 1;
+
+    Key key = {fft_size.template expand<3>(), D, kind};
+    VendorPlan* cached_plan = nullptr;
+    if (cache) {
+        if constexpr (std::is_same_v<float,T>) {
+            cached_plan = get_vendor_plan_f(key);
+        } else {
+            cached_plan = get_vendor_plan_d(key);
+        }
+#if defined(AMREX_USE_GPU)
+        if (cached_plan) {
+            plan = *cached_plan;
+            return;
+        }
+#endif
+    }
+
+#if defined(AMREX_USE_CUDA)
+
+    AMREX_CUFFT_SAFE_CALL(cufftCreate(&plan));
+    AMREX_CUFFT_SAFE_CALL(cufftSetAutoAllocation(plan, 0));
+    cufftType type;
+    if constexpr (D == Direction::forward) {
+        type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
+    } else {
+        type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
+    }
+    if constexpr (M == 1) {
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlan1d(plan, fft_size[0], type, howmany, &work_size));
+    } else if constexpr (M == 2) {
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlan2d(plan, fft_size[1], fft_size[0], type, &work_size));
+    } else if constexpr (M == 3) {
+        AMREX_CUFFT_SAFE_CALL
+            (cufftMakePlan3d(plan, fft_size[2], fft_size[1], fft_size[0], type, &work_size));
+    }
+    AMREX_CUFFT_SAFE_CALL(cufftSetStream(plan, Gpu::gpuStream()));
+
+#elif defined(AMREX_USE_HIP)
+
+    auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+    std::size_t length[M];
+    for (int idim = 0; idim < M; ++idim) { length[idim] = fft_size[idim]; }
+    if constexpr (D == Direction::forward) {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&plan, rocfft_placement_notinplace,
+                                rocfft_transform_type_real_forward, prec, M,
+                                length, howmany, nullptr));
+    } else {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&plan, rocfft_placement_notinplace,
+                                rocfft_transform_type_real_inverse, prec, M,
+                                length, howmany, nullptr));
+    }
+
+#elif defined(AMREX_USE_SYCL)
+
+    mkl_desc_r* pp;
+    if (M == 1) {
+        pp = new mkl_desc_r(fft_size[0]);
+    } else {
+        std::vector<std::int64_t> len(M);
+        for (int idim = 0; idim < M; ++idim) {
+            len[idim] = fft_size[M-1-idim];
+        }
+        pp = new mkl_desc_r(len);
+    }
+#ifndef AMREX_USE_MKL_DFTI_2024
+    pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                  oneapi::mkl::dft::config_value::NOT_INPLACE);
+#else
+    pp->set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_NOT_INPLACE);
+#endif
+
+    std::vector<std::int64_t> strides(M+1);
+    strides[0] = 0;
+    strides[M] = 1;
+    for (int i = M-1; i >= 1; --i) {
+        strides[i] = strides[i+1] * fft_size[M-1-i];
+    }
+#ifndef AMREX_USE_MKL_DFTI_2024
+    pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides);
+    pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides);
+#else
+    pp->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
+    pp->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
+#endif
+    pp->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                  oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+    pp->commit(amrex::Gpu::Device::streamQueue());
+    plan = pp;
+
+#else /* FFTW */
+
+    int size_for_row_major[M];
+    for (int idim = 0; idim < M; ++idim) {
+        size_for_row_major[idim] = fft_size[M-1-idim];
+    }
+
+    if constexpr (std::is_same_v<float,T>) {
+        if constexpr (D == Direction::forward) {
+            plan = fftwf_plan_dft_r2c
+                (M, size_for_row_major, (float*)buffer, (fftwf_complex*)buffer,
+                 FFTW_ESTIMATE);
+        } else {
+            plan = fftwf_plan_dft_c2r
+                (M, size_for_row_major, (fftwf_complex*)buffer, (float*)buffer,
+                 FFTW_ESTIMATE);
+        }
+    } else {
+        if constexpr (D == Direction::forward) {
+            plan = fftw_plan_dft_r2c
+                (M, size_for_row_major, (double*)buffer, (fftw_complex*)buffer,
+                 FFTW_ESTIMATE);
+        } else {
+            plan = fftw_plan_dft_c2r
+                (M, size_for_row_major, (fftw_complex*)buffer, (double*)buffer,
+                 FFTW_ESTIMATE);
+        }
+    }
+#endif
+
+    if (cache) {
+#if !defined(AMREX_USE_GPU)
+        if (cached_plan) {
+            destroy_vendor_plan(*cached_plan);
+            *cached_plan = plan;
+        } else
+#endif
+        {
+            if constexpr (std::is_same_v<float,T>) {
+                add_vendor_plan_f(key, plan);
+            } else {
+                add_vendor_plan_d(key, plan);
+            }
+        }
+    }
+}
+
 namespace detail
 {
     DistributionMapping make_iota_distromap (Long n);
diff --git a/Src/FFT/AMReX_FFT_LocalR2C.H b/Src/FFT/AMReX_FFT_LocalR2C.H
new file mode 100644
index 00000000000..f53a61f2e12
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_LocalR2C.H
@@ -0,0 +1,247 @@
+#ifndef AMREX_FFT_LOCAL_R2C_H_
+#define AMREX_FFT_LOCAL_R2C_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_Arena.H>
+#include <AMReX_FFT_Helper.H>
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Local Discrete Fourier Transform
+ *
+ * This class supports Fourier transforms between real and complex data. The
+ * name R2C indicates that the forward transform converts real data to
+ * complex data, while the backward transform converts complex data to real
+ * data. It should be noted that both directions of transformation are
+ * supported, not just from real to complex. The scaling follows the FFTW
+ * convention, where applying the forward transform followed by the backward
+ * transform scales the original data by the size of the input array.
+ *
+ * For more details, we refer the users to
+ * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
+ */
+template <typename T, FFT::Direction D = FFT::Direction::both,
+          int M = AMREX_SPACEDIM>
+class LocalR2C
+{
+public:
+    /**
+     * \brief Constructor
+     *
+     * \param domain the forward domain (i.e., the domain of the real data)
+     * \param info optional information
+     */
+    explicit LocalR2C (IntVectND<M> const& fft_size, bool cache_plan = true);
+
+    ~LocalR2C ();
+
+    LocalR2C (LocalR2C const&) = delete;
+    LocalR2C (LocalR2C &&) = delete;
+    LocalR2C& operator= (LocalR2C const&) = delete;
+    LocalR2C& operator= (LocalR2C &&) = delete;
+
+    /**
+     * \brief Forward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform. For GPUs, this function is
+     * synchronous on the host.
+     *
+     * \param indata input data
+     * \param outdata output data
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (T const* indata, GpuComplex<T>* outdata);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for forward-only transform. For GPUs, this function is
+     * synchronous on the host.
+     *
+     * \param indata input data
+     * \param outdata output data
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::backward ||
+                                                DIR == Direction::both, int> = 0>
+    void backward (GpuComplex<T> const* indata, T* outdata);
+
+    //! Scaling factor. If the data goes through forward and then backward,
+    //! the result multiplied by the scaling factor is equal to the original
+    //! data.
+    [[nodiscard]] T scalingFactor () const;
+
+    //! Spectral domain size
+    [[nodiscard]] IntVectND<M> const& spectralSize () const {
+        return m_spectral_size;
+    }
+
+private:
+
+    Plan<T> m_fft_fwd;
+    Plan<T> m_fft_bwd;
+
+    void* m_buffer = nullptr; // for fftw
+
+#if defined(AMREX_USE_GPU)
+    gpuStream_t m_gpu_stream;
+#endif
+
+    IntVectND<M> m_real_size;
+    IntVectND<M> m_spectral_size;
+
+    bool m_cache_plan;
+};
+
+template <typename T, FFT::Direction D, int M>
+LocalR2C<T,D,M>::LocalR2C (IntVectND<M> const& fft_size, bool cache_plan)
+    : m_real_size(fft_size),
+      m_spectral_size(fft_size),
+      m_cache_plan(cache_plan)
+{
+    BL_PROFILE("FFT::LocalR2C");
+    m_spectral_size[0] = m_real_size[0]/2 + 1;
+
+#if defined(AMREX_USE_GPU)
+    auto current_stream = Gpu::gpuStream();
+    Gpu::Device::resetStreamIndex();
+    m_gpu_stream = Gpu::gpuStream();
+#else
+    Long total_size = 1;
+    for (auto s : m_spectral_size) {
+        total_size *= s;
+    }
+    m_buffer = The_Arena()->alloc(sizeof(GpuComplex<T>)*total_size);
+#endif
+
+#ifdef AMREX_USE_SYCL
+    m_fft_fwd.template init_r2c<Direction::forward,M>(m_real_size, m_buffer, cache_plan);
+    m_fft_bwd = m_fft_fwd;
+#else
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        m_fft_fwd.template init_r2c<Direction::forward,M>(m_real_size, m_buffer, cache_plan);
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        m_fft_bwd.template init_r2c<Direction::backward,M>(m_real_size, m_buffer, cache_plan);
+    }
+#endif
+
+#if defined(AMREX_USE_GPU)
+    Gpu::Device::setStream(current_stream);
+#endif
+}
+
+template <typename T, FFT::Direction D, int M>
+LocalR2C<T,D,M>::~LocalR2C ()
+{
+    static_assert(M >= 1 && M <= 3);
+
+    if (m_buffer) { The_Arena()->free(m_buffer); }
+
+    if (!m_cache_plan) {
+        if (m_fft_bwd.plan != m_fft_fwd.plan) {
+            m_fft_bwd.destroy();
+        }
+        m_fft_fwd.destroy();
+    }
+}
+
+template <typename T, FFT::Direction D, int M>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void LocalR2C<T,D,M>::forward (T const* indata, GpuComplex<T>* outdata)
+{
+    BL_PROFILE("FFT::LocalR2C::forward");
+
+#if defined(AMREX_USE_GPU)
+    m_fft_fwd.set_ptrs((void*)indata, (void*)outdata);
+    auto current_stream = Gpu::gpuStream();
+    if (current_stream != m_gpu_stream) {
+        Gpu::streamSynchronize();
+        Gpu::Device::setStream(m_gpu_stream);
+    }
+#else
+    int ny = 1;
+    for (int idim = 1; idim < M; ++idim) {
+        ny *= m_real_size[idim];
+    }
+    auto* pdst = (char*)m_buffer;
+    auto* psrc = (char*)indata;
+    for (int j = 0; j < ny; ++j) {
+        std::memcpy(pdst, psrc, sizeof(T) * m_real_size[0]);
+        pdst += sizeof(GpuComplex<T>) * m_spectral_size[0];
+        psrc += sizeof(           T ) *     m_real_size[0];
+    }
+#endif
+
+    m_fft_fwd.template compute_r2c<Direction::forward>();
+
+#if defined(AMREX_USE_GPU)
+    if (current_stream != m_gpu_stream) {
+        Gpu::Device::setStream(current_stream);
+    }
+#else
+    std::size_t nbytes = sizeof(GpuComplex<T>);
+    for (auto s : m_spectral_size) { nbytes *= s; }
+    std::memcpy(outdata, m_buffer, nbytes);
+#endif
+}
+
+template <typename T, FFT::Direction D, int M>
+template <Direction DIR, std::enable_if_t<DIR == Direction::backward ||
+                                          DIR == Direction::both, int> >
+void LocalR2C<T,D,M>::backward (GpuComplex<T> const* indata, T* outdata)
+{
+    BL_PROFILE("FFT::LocalR2C::backward");
+
+#if defined(AMREX_USE_GPU)
+    m_fft_bwd.set_ptrs((void*)outdata, (void*)indata);
+    auto current_stream = Gpu::gpuStream();
+    if (current_stream != m_gpu_stream) {
+        Gpu::streamSynchronize();
+        Gpu::Device::setStream(m_gpu_stream);
+    }
+#else
+    std::size_t nbytes = sizeof(GpuComplex<T>);
+    for (auto s : m_spectral_size) { nbytes *= s; }
+    std::memcpy(m_buffer, indata, nbytes);
+#endif
+
+    m_fft_bwd.template compute_r2c<Direction::backward>();
+
+#if defined(AMREX_USE_GPU)
+    if (current_stream != m_gpu_stream) {
+        Gpu::Device::setStream(current_stream);
+    }
+#else
+    int ny = 1;
+    for (int idim = 1; idim < M; ++idim) {
+        ny *= m_real_size[idim];
+    }
+    auto* pdst = (char*)outdata;
+    auto* psrc = (char*)m_buffer;
+    for (int j = 0; j < ny; ++j) {
+        std::memcpy(pdst, psrc, sizeof(T) * m_real_size[0]);
+        pdst += sizeof(           T ) *     m_real_size[0];
+        psrc += sizeof(GpuComplex<T>) * m_spectral_size[0];
+    }
+#endif
+}
+
+template <typename T, FFT::Direction D, int M>
+T LocalR2C<T,D,M>::scalingFactor () const
+{
+    T r = 1;
+    for (auto s : m_real_size) {
+        r *= T(s);
+    }
+    return T(1)/r;
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT_R2C.H b/Src/FFT/AMReX_FFT_R2C.H
index aaa5fac4c38..456a3ddf7d8 100644
--- a/Src/FFT/AMReX_FFT_R2C.H
+++ b/Src/FFT/AMReX_FFT_R2C.H
@@ -14,7 +14,7 @@ namespace amrex::FFT
 template <typename T> class OpenBCSolver;
 
 /**
- * \brief Discrete Fourier Transform
+ * \brief Parallel Discrete Fourier Transform
  *
  * This class supports Fourier transforms between real and complex data. The
  * name R2C indicates that the forward transform converts real data to
diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt
index cbb205dd2e1..6dd81507114 100644
--- a/Src/FFT/CMakeLists.txt
+++ b/Src/FFT/CMakeLists.txt
@@ -7,6 +7,7 @@ foreach(D IN LISTS AMReX_SPACEDIM)
        PRIVATE
        AMReX_FFT.H
        AMReX_FFT.cpp
+       AMReX_FFT_LocalR2C.H
        AMReX_FFT_OpenBCSolver.H
        AMReX_FFT_R2C.H
        AMReX_FFT_R2X.H
diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package
index 82cc4803eab..fb369b7cafa 100644
--- a/Src/FFT/Make.package
+++ b/Src/FFT/Make.package
@@ -3,6 +3,7 @@ ifndef AMREX_FFT_MAKE
 
 CEXE_headers += AMReX_FFT.H AMReX_FFT_Helper.H AMReX_FFT_Poisson.H
 CEXE_headers += AMReX_FFT_OpenBCSolver.H AMReX_FFT_R2C.H AMReX_FFT_R2X.H
+CEXE_headers += AMReX_FFT_LocalR2C.H
 CEXE_sources += AMReX_FFT.cpp
 
 VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT
diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp
index 594a9ec760d..ee70b43b7bb 100644
--- a/Tests/FFT/R2C/main.cpp
+++ b/Tests/FFT/R2C/main.cpp
@@ -17,7 +17,7 @@ int main (int argc, char* argv[])
                      int n_cell_y = 32;,
                      int n_cell_z = 64);
 
-        AMREX_D_TERM(int max_grid_size_x = 32;,
+        AMREX_D_TERM(int max_grid_size_x = 64;,
                      int max_grid_size_y = 32;,
                      int max_grid_size_z = 32);
 
@@ -120,6 +120,49 @@ int main (int argc, char* argv[])
             auto eps = 1.e-6f;
 #else
             auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+
+        {
+            Real error = 0;
+            BaseFab<GpuComplex<Real>> cfab;
+            for (MFIter mfi(mf); mfi.isValid(); ++mfi)
+            {
+                auto& fab = mf[mfi];
+                auto& fab2 = mf2[mfi];
+                Box const& box = fab.box();
+                {
+                    FFT::LocalR2C<Real,FFT::Direction::both> fft(box.length());
+                    Box cbox(IntVect(0), fft.spectralSize() - 1);
+                    cfab.resize(cbox);
+                    fft.forward(fab.dataPtr(), cfab.dataPtr());
+                    fft.backward(cfab.dataPtr(), fab2.dataPtr());
+                    auto fac = fft.scalingFactor();
+                    fab2.template xpay<RunOn::Device>(-fac, fab, box, box, 0, 0, 1);
+                    auto e = fab2.template norm<RunOn::Device>(0);
+                    error = std::max(e,error);
+                }
+                {
+                    FFT::LocalR2C<Real,FFT::Direction::forward> fft(box.length());
+                    fft.forward(fab.dataPtr(), cfab.dataPtr());
+                }
+                {
+                    FFT::LocalR2C<Real,FFT::Direction::backward> fft(box.length());
+                    fft.backward(cfab.dataPtr(), fab2.dataPtr());
+                    auto fac = fft.scalingFactor();
+                    fab2.template xpay<RunOn::Device>(-fac, fab, box, box, 0, 0, 1);
+                    auto e = fab2.template norm<RunOn::Device>(0);
+                    error = std::max(e,error);
+                }
+            }
+
+            ParallelDescriptor::ReduceRealMax(error);
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
 #endif
             AMREX_ALWAYS_ASSERT(error < eps);
         }