diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index 3dafadcd74b..30af9c9cfc0 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -36,37 +36,31 @@ void fill (BaseFab& aos_fab, F && f) "amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE"); #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; - int ntotcells = box.numPts(); + BoxIndexer indexer(box); + const auto ntotcells = std::uint64_t(box.numPts()); int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128; - int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block; + AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits::max())); + auto nblocks = int(nblocks_long); std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT); T* p = (T*)aos_fab.dataPtr(); #ifdef AMREX_USE_SYCL amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept { - int icell = handler.globalIdx(); - unsigned int blockDimx = handler.blockDim(); - unsigned int threadIdxx = handler.threadIdx(); - unsigned int blockIdxx = handler.blockIdx(); + auto const icell = std::uint64_t(handler.globalIdx()); + std::uint64_t const blockDimx = handler.blockDim(); + std::uint64_t const threadIdxx = handler.threadIdx(); + std::uint64_t const blockIdxx = handler.blockIdx(); auto const shared = (T*)handler.sharedMemory(); if (icell < ntotcells) { auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } handler.sharedBarrier(); - for (unsigned int m = threadIdxx, - mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE; + for (std::uint64_t m = threadIdxx, + mend = amrex::min(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE; m < mend; m += blockDimx) { p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m]; } @@ -75,24 +69,19 @@ void fill (BaseFab& aos_fab, F && f) amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int icell = blockDim.x*blockIdx.x+threadIdx.x; + std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; Gpu::SharedMemory gsm; T* const shared = gsm.dataPtr(); if (icell < ntotcells) { - auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT; + auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } __syncthreads(); - for (unsigned int m = threadIdx.x, - mend = amrex::min(blockDim.x, ntotcells-blockDim.x*blockIdx.x) * STRUCTSIZE; + for (std::uint64_t m = threadIdx.x, + mend = amrex::min(blockDim.x, ntotcells-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE; m < mend; m += blockDim.x) { - p[blockDim.x*blockIdx.x*STRUCTSIZE+m] = shared[m]; + p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m]; } }); #endif diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 0a32d637d4f..33d0716a86a 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -15,6 +15,7 @@ #include #include #include 
+#include #include @@ -1835,6 +1836,86 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ); } +struct BoxIndexer +{ +#if (AMREX_SPACEDIM == 3) + Math::FastDivmodU64 fdxy; + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))), + fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y, z, rem; + fdxy(z, rem, icell); + fdx(y, x, rem); + return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]}; + } + +#elif (AMREX_SPACEDIM == 2) + + Math::FastDivmodU64 fdx; + IntVect lo; + + BoxIndexer (Box const& box) + : fdx (std::uint64_t(box.length(0))), + lo (box.smallEnd()) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1], 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + std::uint64_t x, y; + fdx(y, x, icell); + return {int(x)+lo[0], int(y)+lo[1]}; + } + +#elif (AMREX_SPACEDIM == 1) + + int lo; + + BoxIndexer (Box const& box) + : lo(box.smallEnd(0)) + {} + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + Dim3 operator() (std::uint64_t icell) const + { + return {int(icell)+lo, 0, 0}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + IntVect intVect (std::uint64_t icell) const + { + return IntVect{int(icell)+lo}; + } + +#endif +}; + } #endif /*AMREX_BOX_H*/ diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c4ba7dd86bc..ed61eadc13b 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 78e9e856535..a280968c773 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -23,8 +23,8 @@ template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -47,7 +47,7 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, template void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = nthreads_per_block * nblocks; + const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks; auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -68,8 +68,8 @@ template void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = MT * nblocks; - std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + const 
auto nthreads_total = MT * std::size_t(nblocks); + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) / sizeof(unsigned long long); auto& q = *(stream.queue); try { @@ -93,7 +93,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, template void launch (int nblocks, gpuStream_t stream, L&& f) noexcept { - int nthreads_total = MT * nblocks; + const auto nthreads_total = MT * std::size_t(nblocks); auto& q = *(stream.queue); try { q.submit([&] (sycl::handler& h) { @@ -116,8 +116,8 @@ void launch (T const& n, L&& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -192,8 +192,8 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::makeExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -206,11 +206,11 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), - (T)item.get_local_range(0)); - detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0), + item.get_local_range(0)); + detail::call_f(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } }); @@ -223,9 +223,9 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - detail::call_f(f, i, Gpu::Handler{&item}); + for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0); + i < std::size_t(n); i += stride) { + detail::call_f(f, T(i), Gpu::Handler{&item}); } }); }); @@ -239,14 +239,11 @@ template void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -259,16 +256,11 @@ void ParallelFor (Gpu::KernelInfo 
const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); } @@ -282,14 +274,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,Gpu::Handler{&item}); } }); @@ -304,14 +291,11 @@ template (ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { if (info.hasReduction()) { @@ -324,16 +308,11 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - int n_active_threads = amrex::min(ncells-icell+(int)item.get_local_id(0), - (int)item.get_local_range(0)); + auto [i, j, k] = indexer(icell); + int n_active_threads = amrex::min(ncells-icell+std::uint64_t(item.get_local_id(0)), + std::uint64_t(item.get_local_range(0))); detail::call_f(f, i, j, k, ncomp, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), n_active_threads}); @@ -348,14 +327,9 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); } }); @@ -371,8 +345,8 @@ void ParallelForRNG (T n, L&& f) noexcept { if (amrex::isEmpty(n)) { return; } const auto ec = Gpu::ExecutionConfig(n); - int nthreads_per_block = ec.numThreads.x; - int 
nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -384,11 +358,11 @@ void ParallelForRNG (T n, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (T i = tid, stride = item.get_global_range(0); i < n; i += stride) { - f(i,rand_eng); + for (std::size_t i = tid, stride = item.get_global_range(0); i < std::size_t(n); i += stride) { + f(T(i),rand_eng); } engine_acc.store(engine, tid); }); @@ -403,14 +377,11 @@ template void ParallelForRNG (Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * amrex::min(ec.numBlocks.x,Gpu::Device::maxBlocksPerLaunch()); auto& q = Gpu::Device::streamQueue(); auto& engdescr = *(getRandEngineDescriptor()); try { @@ -422,17 +393,12 @@ void ParallelForRNG (Box const& box, L&& f) noexcept [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - int tid = item.get_global_id(0); + auto const tid = item.get_global_id(0); auto engine = engine_acc.load(tid); RandomEngine rand_eng{&engine}; - for (int icell = tid, stride = item.get_global_range(0); + for (std::uint64_t icell = tid, stride = item.get_global_range(0); icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + auto [i, j, k] = indexer(icell); f(i,j,k,rand_eng); } engine_acc.store(engine, tid); @@ -448,14 +414,11 @@ template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto 
nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -518,24 +470,14 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -552,25 +494,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -580,33 +513,18 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -625,20 +543,14 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box 
const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -648,26 +560,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, [[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -690,25 +592,16 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); - int nthreads_per_block = ec.numThreads.x; - int nthreads_total = nthreads_per_block * ec.numBlocks.x; + const auto nthreads_per_block = ec.numThreads.x; + const auto nthreads_total = std::size_t(nthreads_per_block) * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { q.submit([&] (sycl::handler& h) { @@ -718,37 +611,22 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, 
[[sycl::reqd_work_group_size(1,1,MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0); icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } @@ -822,7 +700,7 @@ void launch (T const& n, L&& f) noexcept namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N /*nleft*/) + auto call_f (F const& f, N i, std::uint64_t /*nleft*/) noexcept -> decltype(f(0)) { f(i); @@ -830,15 +708,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, N i, N nleft) + auto call_f (F const& f, N i, std::uint64_t nleft) noexcept -> decltype(f(0,Gpu::Handler{})) { - f(i,Gpu::Handler(amrex::min(nleft,(N)blockDim.x))); + f(i,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0)) { f(i,j,k); @@ -846,15 +724,15 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, int nleft) + auto call_f (F const& f, int i, int j, int k, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,Gpu::Handler{})) { - f(i,j,k,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + f(i,j,k,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, int /*nleft*/) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t /*nleft*/) noexcept -> decltype(f(0,0,0,0)) { for (T n = 0; n < ncomp; ++n) f(i,j,k,n); @@ -862,10 +740,10 @@ namespace detail { template AMREX_GPU_DEVICE - auto call_f (F const& f, int i, int j, int k, T ncomp, int nleft) + auto call_f (F const& f, int i, int j, int k, T ncomp, std::uint64_t nleft) noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) { - for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(std::uint64_t)blockDim.x))); } } @@ -877,9 +755,9 @@ ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept const auto ec = Gpu::makeExecutionConfig(n); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - i < n; i += stride) { - detail::call_f(f, i, (n-i+(T)threadIdx.x)); + for (Long i = Long(blockDim.x)*blockIdx.x+threadIdx.x, stride = Long(blockDim.x)*gridDim.x; + i < Long(n); i += stride) { + detail::call_f(f, T(i), (Long(n)-i+(Long)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -890,24 +768,16 @@ 
std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, (ncells-icell+(int)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, (ncells-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -918,23 +788,15 @@ std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - detail::call_f(f, i, j, k, ncomp, (ncells-icell+(int)threadIdx.x)); + auto [i, j, k] = indexer(icell); + detail::call_f(f, i, j, k, ncomp, (ncells-icell+(std::uint64_t)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -951,10 +813,10 @@ ParallelForRNG (T n, L&& f) noexcept amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (T i = tid, stride = blockDim.x*gridDim.x; i < n; i += stride) { - f(i,engine); + for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) { + f(T(i),engine); } }); Gpu::streamSynchronize(); // To avoid multiple streams using RNG @@ -967,25 +829,17 @@ ParallelForRNG (Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = 
blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { + auto [i, j, k] = indexer(icell); f(i,j,k,engine); } }); @@ -999,25 +853,17 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) { return; } randState_t* rand_state = getRandState(); - int ncells = box.numPts(); - const auto lo = amrex::lbound(box); - const auto len = amrex::length(box); - const auto lenxy = len.x*len.y; - const auto lenx = len.x; + const auto ncells = std::uint64_t(box.numPts()); + const BoxIndexer indexer(box); const auto ec = Gpu::ExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - int tid = blockDim.x*blockIdx.x+threadIdx.x; + auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; RandomEngine engine{&(rand_state[tid])}; - for (int icell = tid, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; + for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { + auto [i, j, k] = indexer(icell); for (T n = 0; n < ncomp; ++n) { f(i,j,k,n,engine); } @@ -1033,38 +879,22 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } } @@ -1079,52 +909,28 @@ ParallelFor (Gpu::KernelInfo const&, L1&& f1, L2&& f2, L3&& f3) noexcept { if (amrex::isEmpty(box1) && 
amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); f1(i,j,k); } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); f2(i,j,k); } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); f3(i,j,k); } } @@ -1141,40 +947,24 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells = amrex::max(ncells1, ncells2); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len1x = len1.x; - const auto len2x = len2.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells = amrex::max(ncells1, ncells2); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = 
(icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } @@ -1195,56 +985,32 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2) && amrex::isEmpty(box3)) { return; } - int ncells1 = box1.numPts(); - int ncells2 = box2.numPts(); - int ncells3 = box3.numPts(); - int ncells = amrex::max(ncells1, ncells2, ncells3); - const auto lo1 = amrex::lbound(box1); - const auto lo2 = amrex::lbound(box2); - const auto lo3 = amrex::lbound(box3); - const auto len1 = amrex::length(box1); - const auto len2 = amrex::length(box2); - const auto len3 = amrex::length(box3); - const auto len1xy = len1.x*len1.y; - const auto len2xy = len2.x*len2.y; - const auto len3xy = len3.x*len3.y; - const auto len1x = len1.x; - const auto len2x = len2.x; - const auto len3x = len3.x; + const auto ncells1 = std::uint64_t(box1.numPts()); + const auto ncells2 = std::uint64_t(box2.numPts()); + const auto ncells3 = std::uint64_t(box3.numPts()); + const auto ncells = amrex::max(ncells1, ncells2, ncells3); + const BoxIndexer indexer1(box1); + const BoxIndexer indexer2(box2); + const BoxIndexer indexer3(box3); const auto ec = Gpu::makeExecutionConfig(ncells); AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { - for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; + for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < ncells; icell += stride) { if (icell < ncells1) { - int k = icell / len1xy; - int j = (icell - k*len1xy) / len1x; - int i = (icell - k*len1xy) - j*len1x; - i += lo1.x; - j += lo1.y; - k += lo1.z; + auto [i, j, k] = indexer1(icell); for (T1 n = 0; n < ncomp1; ++n) { f1(i,j,k,n); } } if (icell < ncells2) { - int k = icell / len2xy; - int j = (icell - k*len2xy) / len2x; - int i = (icell - k*len2xy) - j*len2x; - i += lo2.x; - j += lo2.y; - k += lo2.z; + auto [i, j, k] = indexer2(icell); for (T2 n = 0; n < ncomp2; ++n) { f2(i,j,k,n); } } if (icell < ncells3) { - int k = icell / len3xy; - int j = (icell - k*len3xy) / len3x; - int i = (icell - k*len3xy) - j*len3x; - i += lo3.x; - j += lo3.y; - k += lo3.z; + auto [i, j, k] = indexer3(icell); for (T3 n = 0; n < ncomp3; ++n) { f3(i,j,k,n); } diff --git a/Src/Base/AMReX_INT.H b/Src/Base/AMReX_INT.H index f8ab0e9ba8f..f54852cb64b 100644 --- a/Src/Base/AMReX_INT.H +++ b/Src/Base/AMReX_INT.H @@ -31,4 +31,29 @@ namespace amrex { } #endif +#if (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) + +#define AMREX_INT128_SUPPORTED 1 + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +typedef unsigned __int128 amrex_uint128_t; // NOLINT(modernize-use-using) +typedef __int128 amrex_int128_t; // NOLINT(modernize-use-using) + +#ifdef __cplusplus +namespace amrex { + using UInt128_t = amrex_uint128_t; + using Int128_t = amrex_int128_t; +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#endif /* (defined(__x86_64) || defined (__aarch64__)) && !defined(_WIN32) && (defined(__GNUC__) || defined(__clang__)) */ + #endif diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 506289d03d5..7d84a056a8a 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -4,6 +4,7 
@@ #include #include +#include #include #include #include @@ -208,6 +209,161 @@ constexpr T powi (T x) noexcept } } +#if defined(AMREX_INT128_SUPPORTED) +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +std::uint64_t umulhi (std::uint64_t a, std::uint64_t b) +{ +#if defined(AMREX_USE_SYCL) + return sycl::mul_hi(a,b); +#else + AMREX_IF_ON_DEVICE(( return __umul64hi(a, b); )) + AMREX_IF_ON_HOST(( + auto tmp = amrex::UInt128_t(a) * amrex::UInt128_t(b); + return std::uint64_t(tmp >> 64); + )) +#endif +} +#endif + +/*************************************************************************************************** + * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Object to encapsulate the fast division+modulus operation for 64b integer division. +/// +/// Example: +/// +/// +/// uint64_t quotient, remainder, dividend, divisor; +/// +/// FastDivmodU64 divmod(divisor); +/// +/// divmod(quotient, remainder, dividend); +/// +/// // quotient = (dividend / divisor) +/// // remainder = (dividend % divisor) +/// +struct FastDivmodU64 +{ + std::uint64_t divisor; + +#ifdef AMREX_INT128_SUPPORTED + std::uint64_t multiplier = 1U; + unsigned int shift_right = 0; + unsigned int round_up = 0; + + // + // Static methods + // + + /// Computes b, where 2^b is the greatest power of two that is less than or equal to x + static std::uint32_t integer_log2 (std::uint64_t x) + { + std::uint32_t n = 0; + while (x >>= 1) { + ++n; + } + return n; + } + + /// Construct the FastDivmod object, in host code only + /// + /// This precomputes some values based on the divisor and is computationally expensive. 
+    FastDivmodU64 (std::uint64_t divisor_)
+        : divisor(divisor_)
+    {
+        if (divisor) {
+            shift_right = integer_log2(divisor);
+
+            if ((divisor & (divisor - 1)) == 0) {
+                multiplier = 0;
+            }
+            else {
+                std::uint64_t power_of_two = (std::uint64_t(1) << shift_right);
+                auto n = amrex::UInt128_t(power_of_two) << 64;
+                std::uint64_t multiplier_lo = n / divisor;
+                n += power_of_two;
+                multiplier = n / divisor;
+                round_up = (multiplier_lo == multiplier ? 1 : 0);
+            }
+        }
+    }
+
+#else
+
+    FastDivmodU64 (std::uint64_t divisor_) : divisor(divisor_) {}
+
+#endif
+
+    /// Returns the quotient of floor(dividend / divisor)
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE
+    std::uint64_t divide (std::uint64_t dividend) const
+    {
+#if defined(AMREX_INT128_SUPPORTED)
+        auto x = dividend;
+        if (multiplier) {
+            x = amrex::Math::umulhi(dividend + round_up, multiplier);
+        }
+        return (x >> shift_right);
+#else
+        return dividend / divisor;
+#endif
+    }
+
+    /// Computes the remainder given a computed quotient and dividend
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE
+    std::uint64_t modulus (std::uint64_t quotient, std::uint64_t dividend) const
+    {
+        return dividend - quotient * divisor;
+    }
+
+    /// Returns the quotient of floor(dividend / divisor) and computes the remainder
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE
+    std::uint64_t divmod (std::uint64_t &remainder, std::uint64_t dividend) const
+    {
+        auto quotient = divide(dividend);
+        remainder = modulus(quotient, dividend);
+        return quotient;
+    }
+
+    /// Computes integer division and modulus using precomputed values. This is computationally
+    /// inexpensive.
+    AMREX_GPU_HOST_DEVICE
+    void operator() (std::uint64_t &quotient, std::uint64_t &remainder, std::uint64_t dividend) const
+    {
+        quotient = divmod(remainder, dividend);
+    }
+};
+
 }

 #endif
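
Reviewer notes (illustrative only, not part of the patch). The core change is that flat cell indices are now carried as std::uint64_t and decoded through BoxIndexer / Math::FastDivmodU64 instead of 32-bit int arithmetic, so boxes with more than 2^31-1 points index correctly. Below is a minimal host-side sketch of how BoxIndexer is meant to behave, assuming a 3D (AMREX_SPACEDIM == 3) build of this branch; the box extents are arbitrary example values.

```cpp
#include <AMReX.H>
#include <AMReX_Box.H>
#include <AMReX_Print.H>
#include <cstdint>

// Sketch: verify that BoxIndexer reproduces the usual x-fastest
// ordering of cells in a Box (3D build assumed; extents arbitrary).
int main (int argc, char* argv[])
{
    amrex::Initialize(argc, argv);
    {
        amrex::Box box(amrex::IntVect(-4,0,7), amrex::IntVect(12,9,15));
        amrex::BoxIndexer indexer(box);

        std::uint64_t icell = 0;
        bool ok = true;
        for (int k = box.smallEnd(2); k <= box.bigEnd(2); ++k) {
        for (int j = box.smallEnd(1); j <= box.bigEnd(1); ++j) {
        for (int i = box.smallEnd(0); i <= box.bigEnd(0); ++i) {
            auto [ii, jj, kk] = indexer(icell);   // decode flat 64-bit index
            ok = ok && (ii == i) && (jj == j) && (kk == k);
            ++icell;
        }}}
        amrex::Print() << (ok ? "BoxIndexer matches triple loop\n"
                              : "mismatch!\n");
    }
    amrex::Finalize();
}
```

The decode is x-fastest, matching the `blockDim.x*blockIdx.x + threadIdx.x` flattening used in the launch code above.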
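FastDivmodU64 replaces the per-cell 64-bit `/` and `%` with a multiply-and-shift using a magic multiplier precomputed on the host (the CUTLASS-derived code above). A hedged usage sketch follows; the divisor values are arbitrary and the brute-force comparison is purely illustrative.

```cpp
#include <AMReX_Math.H>
#include <cstdint>
#include <cstdio>

int main ()
{
    // Divisors of the kind BoxIndexer produces: box.length(0) and
    // box.length(0)*box.length(1). Values here are arbitrary examples.
    const std::uint64_t divisors[] = {1, 3, 17, 64, 1000, 123456789ull};

    for (auto d : divisors) {
        amrex::Math::FastDivmodU64 fd(d);   // precompute multiplier/shift on the host
        bool ok = true;
        for (std::uint64_t n = 0; n < 1000000; ++n) {
            std::uint64_t q, r;
            fd(q, r, n);                    // q = n/d, r = n%d
            ok = ok && (q == n/d) && (r == n%d);
        }
        std::printf("divisor %llu: %s\n", (unsigned long long)d, ok ? "ok" : "MISMATCH");
    }
}
```

The precomputation happens once per kernel launch, where the divisor (a box length, or length(0)*length(1)) is already known; inside the kernel only a 64-bit high-multiply and a shift remain, which is much cheaper than 64-bit integer division on GPUs.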
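The one compiler-specific ingredient is the upper half of the 64x64-bit product in umulhi: __umul64hi on CUDA/HIP devices, sycl::mul_hi under SYCL, and unsigned __int128 on the host, which is why AMReX_INT.H now exposes amrex_uint128_t only for x86-64/aarch64 GCC and Clang. A standalone sketch of the host idiom (names below are illustrative, not AMReX API):

```cpp
#include <cstdint>
#include <cstdio>

// High 64 bits of a*b via GCC/Clang's unsigned __int128 -- the same idiom
// the patch uses for the host branch of amrex::Math::umulhi when
// AMREX_INT128_SUPPORTED is defined.
std::uint64_t umulhi64 (std::uint64_t a, std::uint64_t b)
{
    unsigned __int128 full = (unsigned __int128)a * b;  // exact 128-bit product
    return (std::uint64_t)(full >> 64);                 // keep the upper half
}

int main ()
{
    // (2^32+1)^2 = 2^64 + 2^33 + 1, so the high 64 bits are exactly 1.
    std::uint64_t a = (std::uint64_t(1) << 32) + 1;
    std::printf("umulhi64 = %llu (expect 1)\n",
                (unsigned long long) umulhi64(a, a));
}
```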