diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index 3dafadcd74b..30af9c9cfc0 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -36,37 +36,31 @@ void fill (BaseFab& aos_fab, F && f) "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE"); #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - int ntotcells = box.numPts(); + BoxIndexer indexer(box); + const auto ntotcells = std::uint64_t(box.numPts()); int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128; - int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits::max())); + auto nblocks = int(nblocks_long); std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT); T* p = (T*)aos_fab.dataPtr(); #ifdef AMREX_USE_SYCL amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept { - int icell = handler.globalIdx(); - unsigned int blockDimx = handler.blockDim(); - unsigned int threadIdxx = handler.threadIdx(); - unsigned int blockIdxx = handler.blockIdx(); + auto const icell = std::uint64_t(handler.globalIdx()); + std::uint64_t const blockDimx = handler.blockDim(); + std::uint64_t const threadIdxx = handler.threadIdx(); + std::uint64_t const blockIdxx = handler.blockIdx(); auto const shared = (T*)handler.sharedMemory(); if (icell < ntotcells) { auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } handler.sharedBarrier(); - for (unsigned int m = threadIdxx, - mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE; + for (std::uint64_t m = threadIdxx, + mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE; m < mend; m += blockDimx) { p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m]; } @@ -75,24 +69,19 @@ void fill (BaseFab& aos_fab, F && f) amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int icell = blockDim.x*blockIdx.x+threadIdx.x; + std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; Gpu::SharedMemory gsm; T* const shared = gsm.dataPtr(); if (icell < ntotcells) { - auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } __syncthreads(); - for (unsigned int m = threadIdx.x, - mend = amrex::min(blockDim.x, ntotcells-blockDim.x*blockIdx.x) * STRUCTSIZE; + for (std::uint64_t m = threadIdx.x, + mend = amrex::min(blockDim.x, ntotcells-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE; m < mend; m += blockDim.x) { - p[blockDim.x*blockIdx.x*STRUCTSIZE+m] = shared[m]; + p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m]; } }); #endif diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 0a32d637d4f..33d0716a86a 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -15,6 +15,7 @@ #include #include #include 
+#include #include @@ -1835,6 +1836,86 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ); } +struct BoxIndexer +{ +#if (AMREX_SPACEDIM == 3) + Math::FastDivmodU64 fdxy; + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))), + fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + +#elif (AMREX_SPACEDIM == 2) + + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1], 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1]}; + } + +#elif (AMREX_SPACEDIM == 1) + + int lo; + + BoxIndexer (Box const& box) + : lo(box.smallEnd(0)) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + return {int(icell)+lo, 0, 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + return IntVect{int(icell)+lo}; + } + +#endif +}; + } #endif /*AMREX_BOX_H*/ diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c4ba7dd86bc..ed61eadc13b 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 78e9e856535..a280968c773 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -23,8 +23,8 @@ template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -47,7 +47,7 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, template void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -68,8 +68,8 @@ template void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = MT * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const 
auto nthreads_total = MT * std::size_t(nblocks); + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -93,7 +93,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, template void launch (int nblocks, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = MT * nblocks; + const auto nthreads_total = MT * std::size_t(nblocks); auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -116,8 +116,8 @@ void launch (T const& n, L&& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -192,8 +192,8 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -206,11 +206,11 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), - (T)item.get_local_range(0)); - detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0), + item.get_local_range(0)); + detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -223,9 +223,9 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - detail::call_f(f, i, Gpu::Handler{&item}); + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + detail::call_f(f, T(i), Gpu::Handler{&item}); } }); }); @@ -239,14 +239,11 @@ template void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -259,16 +256,11 @@ void ParallelFor (Gpu::KernelInfo 
const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } @@ -282,14 +274,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,Gpu::Handler{&item}); } }); @@ -304,14 +291,11 @@ template (ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -324,16 +308,11 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, ncomp, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); @@ -348,14 +327,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); } }); @@ -371,8 +345,8 @@ void ParallelForRNG (T n, L&& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::ExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int 
nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -384,11 +358,11 @@ void ParallelForRNG (T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (T i = tid, stride = item.get_global_range(0); i < n; i += stride) { - f(i,rand_eng); + for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) { + f(T(i),rand_eng); } engine_acc.store(engine, tid); }); @@ -403,14 +377,11 @@ template void ParallelForRNG (Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -422,17 +393,12 @@ void ParallelForRNG (Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (int icell = tid, stride = item.get_global_range(0); + for (std::uint64_t icell = tid, stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); f(i,j,k,rand_eng); } engine_acc.store(engine, tid); @@ -448,14 +414,11 @@ template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto 
nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -518,24 +470,14 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -552,25 +494,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -580,33 +513,18 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -625,20 +543,14 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box 
const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -648,26 +560,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -690,25 +592,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -718,37 +611,22 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, 
[[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } @@ -822,7 +700,7 @@ void launch (T const& n, L&& f) noexcept namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N /*nleft*/) + auto call_f (F const& f, N i, std::uint64_t /*nleft*/) noexcept -> decltype(f(0)) { f(i); @@ -830,15 +708,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N nleft) + auto call_f (F const& f, N i, std::uint64_t nleft) noexcept -> decltype(f(0,Gpu::Handler{})) { - f(i,Gpu::Handler(amrex::min(nleft,(N)blockDim.x))); + f(i,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0)) { f(i,j,k); @@ -846,15 +724,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int nleft) + auto call_f (F const& f, int i, int j, int k, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,Gpu::Handler{})) { - f(i,j,k,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + f(i,j,k,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0,0)) { for (T n = 0; n < ncomp; ++n) f(i,j,k,n); @@ -862,10 +740,10 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, int nleft) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } } @@ -877,9 +755,9 @@ ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept const auto ec = Gpu::makeExecutionConfig(n); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - detail::call_f(f, i, (n-i+(T)threadIdx.x)); + for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; + i < Long(n); i += stride) { + detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -890,24 +768,16 @@ 
std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, (ncells-icell+(int)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, (ncells-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -918,23 +788,15 @@ std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, ncomp, (ncells-icell+(int)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, ncomp, (ncells-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -951,10 +813,10 @@ ParallelForRNG (T n, L&& f) noexcept amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (T i = tid, stride = blockDim.x*gridDim.x; i < n; i += stride) { - f(i,engine); + for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) { + f(T(i),engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG @@ -967,25 +829,17 @@ ParallelForRNG (Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = 
blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { + auto [i, j, k] = indexer(icell); f(i,j,k,engine); } }); @@ -999,25 +853,17 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { f(i,j,k,n,engine); } @@ -1033,38 +879,22 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -1079,52 +909,28 @@ ParallelFor (Gpu::KernelInfo const&, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && 
amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -1141,40 +947,24 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = 
(icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -1195,56 +985,32 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } diff --git a/Src/Base/AMReX_INT.H b/Src/Base/AMReX_INT.H index f8ab0e9ba8f..f54852cb64b 100644 --- a/Src/Base/AMReX_INT.H +++ b/Src/Base/AMReX_INT.H @@ -31,4 +31,29 @@ namespace amrex { } #endif +#if (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) + +#define AMREX_INT128_SUPPORTED 1 + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +typedef unsigned __int128 amrex_uint128_t; // NOLINT(modernize-use-using) +typedef __int128 amrex_int128_t; // NOLINT(modernize-use-using) + +#ifdef __cplusplus +namespace amrex { + using UInt128_t = amrex_uint128_t; + using Int128_t = amrex_int128_t; +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#endif /* (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) */ + #endif diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 506289d03d5..7d84a056a8a 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -4,6 +4,7 
@@ #include #include +#include #include #include #include @@ -208,6 +209,161 @@ constexpr T powi (T x) noexcept } } +#if defined(AMREX_INT128_SUPPORTED) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +std::uint64_t umulhi (std::uint64_t a, std::uint64_t b) +{ +#if defined(AMREX_USE_SYCL) + return sycl::mul_hi(a,b); +#else + AMREX_IF_ON_DEVICE(( return __umul64hi(a, b); )) + AMREX_IF_ON_HOST(( + auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b); + return std::uint64_t(tmp >> 64); + )) +#endif +} +#endif + +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Object to encapsulate the fast division+modulus operation for 64b integer division. +/// +/// Example: +/// +/// +/// uint64_t quotient, remainder, dividend, divisor; +/// +/// FastDivmodU64 divmod(divisor); +/// +/// divmod(quotient, remainder, dividend); +/// +/// // quotient = (dividend / divisor) +/// // remainder = (dividend % divisor) +/// +struct FastDivmodU64 +{ + std::uint64_t divisor; + +#ifdef AMREX_INT128_SUPPORTED + std::uint64_t multiplier = 1U; + unsigned int shift_right = 0; + unsigned int round_up = 0; + + // + // Static methods + // + + /// Computes b, where 2^b is the greatest power of two that is less than or equal to x + static std::uint32_t integer_log2 (std::uint64_t x) + { + std::uint32_t n = 0; + while (x >>= 1) { + ++n; + } + return n; + } + + /// Construct the FastDivmod object, in host code only + /// + /// This precomputes some values based on the divisor and is computationally expensive. 
+    FastDivmodU64 (std::uint64_t divisor_)
+        : divisor(divisor_)
+    {
+        if (divisor) {
+            shift_right = integer_log2(divisor);
+
+            if ((divisor & (divisor - 1)) == 0) {
+                multiplier = 0;
+            }
+            else {
+                std::uint64_t power_of_two = (std::uint64_t(1) << shift_right);
+                auto n = amrex::UInt128_t(power_of_two) << 64;
+                std::uint64_t multiplier_lo = n / divisor;
+                n += power_of_two;
+                multiplier = n / divisor;
+                round_up = (multiplier_lo == multiplier ? 1 : 0);
+            }
+        }
+    }
+
+#else
+
+    FastDivmodU64 (std::uint64_t divisor_) : divisor(divisor_) {}
+
+#endif
+
+    /// Returns the quotient of floor(dividend / divisor)
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE
+    std::uint64_t divide (std::uint64_t dividend) const
+    {
+#if defined(AMREX_INT128_SUPPORTED)
+        auto x = dividend;
+        if (multiplier) {
+            x = amrex::Math::umulhi(dividend + round_up, multiplier);
+        }
+        return (x >> shift_right);
+#else
+        return dividend / divisor;
+#endif
+    }
+
+    /// Computes the remainder given a computed quotient and dividend
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE
+    std::uint64_t modulus (std::uint64_t quotient, std::uint64_t dividend) const
+    {
+        return dividend - quotient * divisor;
+    }
+
+    /// Returns the quotient of floor(dividend / divisor) and computes the remainder
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE
+    std::uint64_t divmod (std::uint64_t &remainder, std::uint64_t dividend) const
+    {
+        auto quotient = divide(dividend);
+        remainder = modulus(quotient, dividend);
+        return quotient;
+    }
+
+    /// Computes integer division and modulus using precomputed values. This is computationally
+    /// inexpensive.
+    AMREX_GPU_HOST_DEVICE
+    void operator() (std::uint64_t &quotient, std::uint64_t &remainder, std::uint64_t dividend) const
+    {
+        quotient = divmod(remainder, dividend);
+    }
+};
+
 }

 #endif
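
Reviewer notes (illustrative only, not part of the patch). The core change is that flat cell indices are now carried as std::uint64_t and decoded through BoxIndexer / Math::FastDivmodU64 instead of 32-bit int arithmetic, so boxes with more than 2^31-1 points index correctly. Below is a minimal host-side sketch of how BoxIndexer is meant to behave, assuming a 3D (AMREX_SPACEDIM == 3) build of this branch; the box extents are arbitrary example values.

```cpp
#include <AMReX.H>
#include <AMReX_Box.H>
#include <AMReX_Print.H>
#include <cstdint>

// Sketch: verify that BoxIndexer reproduces the usual x-fastest
// ordering of cells in a Box (3D build assumed; extents arbitrary).
int main (int argc, char* argv[])
{
    amrex::Initialize(argc, argv);
    {
        amrex::Box box(amrex::IntVect(-4,0,7), amrex::IntVect(12,9,15));
        amrex::BoxIndexer indexer(box);

        std::uint64_t icell = 0;
        bool ok = true;
        for (int k = box.smallEnd(2); k <= box.bigEnd(2); ++k) {
        for (int j = box.smallEnd(1); j <= box.bigEnd(1); ++j) {
        for (int i = box.smallEnd(0); i <= box.bigEnd(0); ++i) {
            auto [ii, jj, kk] = indexer(icell);   // decode flat 64-bit index
            ok = ok && (ii == i) && (jj == j) && (kk == k);
            ++icell;
        }}}
        amrex::Print() << (ok ? "BoxIndexer matches triple loop\n"
                              : "mismatch!\n");
    }
    amrex::Finalize();
}
```

The decode is x-fastest, matching the `blockDim.x*blockIdx.x + threadIdx.x` flattening used in the launch code above.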
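FastDivmodU64 replaces the per-cell 64-bit `/` and `%` with a multiply-and-shift using a magic multiplier precomputed on the host (the CUTLASS-derived code above). A hedged usage sketch follows; the divisor values are arbitrary and the brute-force comparison is purely illustrative.

```cpp
#include <AMReX_Math.H>
#include <cstdint>
#include <cstdio>

int main ()
{
    // Divisors of the kind BoxIndexer produces: box.length(0) and
    // box.length(0)*box.length(1). Values here are arbitrary examples.
    const std::uint64_t divisors[] = {1, 3, 17, 64, 1000, 123456789ull};

    for (auto d : divisors) {
        amrex::Math::FastDivmodU64 fd(d);   // precompute multiplier/shift on the host
        bool ok = true;
        for (std::uint64_t n = 0; n < 1000000; ++n) {
            std::uint64_t q, r;
            fd(q, r, n);                    // q = n/d, r = n%d
            ok = ok && (q == n/d) && (r == n%d);
        }
        std::printf("divisor %llu: %s\n", (unsigned long long)d, ok ? "ok" : "MISMATCH");
    }
}
```

The precomputation happens once per kernel launch, where the divisor (a box length, or length(0)*length(1)) is already known; inside the kernel only a 64-bit high-multiply and a shift remain, which is much cheaper than 64-bit integer division on GPUs.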
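The one compiler-specific ingredient is the upper half of the 64x64-bit product in umulhi: __umul64hi on CUDA/HIP devices, sycl::mul_hi under SYCL, and unsigned __int128 on the host, which is why AMReX_INT.H now exposes amrex_uint128_t only for x86-64/aarch64 GCC and Clang. A standalone sketch of the host idiom (names below are illustrative, not AMReX API):

```cpp
#include <cstdint>
#include <cstdio>

// High 64 bits of a*b via GCC/Clang's unsigned __int128 -- the same idiom
// the patch uses for the host branch of amrex::Math::umulhi when
// AMREX_INT128_SUPPORTED is defined.
std::uint64_t umulhi64 (std::uint64_t a, std::uint64_t b)
{
    unsigned __int128 full = (unsigned __int128)a * b;  // exact 128-bit product
    return (std::uint64_t)(full >> 64);                 // keep the upper half
}

int main ()
{
    // (2^32+1)^2 = 2^64 + 2^33 + 1, so the high 64 bits are exactly 1.
    std::uint64_t a = (std::uint64_t(1) << 32) + 1;
    std::printf("umulhi64 = %llu (expect 1)\n",
                (unsigned long long) umulhi64(a, a));
}
```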