Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use long integer in GPU kernels #3742

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 18 additions & 29 deletions Src/Base/AMReX_BaseFabUtility.H
Original file line number Diff line number Diff line change
Expand Up @@ -36,37 +36,31 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
"amrex::fill: sizeof(STRUCT) != sizeof(T)*STRUCTSIZE");
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion()) {
const auto lo = amrex::lbound(box);
const auto len = amrex::length(box);
const auto lenxy = len.x*len.y;
const auto lenx = len.x;
int ntotcells = box.numPts();
BoxIndexer indexer(box);
const auto ntotcells = std::uint64_t(box.numPts());
int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
int nblocks = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
auto nblocks = int(nblocks_long);
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
T* p = (T*)aos_fab.dataPtr();
#ifdef AMREX_USE_SYCL
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
{
int icell = handler.globalIdx();
unsigned int blockDimx = handler.blockDim();
unsigned int threadIdxx = handler.threadIdx();
unsigned int blockIdxx = handler.blockIdx();
auto const icell = std::uint64_t(handler.globalIdx());
std::uint64_t const blockDimx = handler.blockDim();
std::uint64_t const threadIdxx = handler.threadIdx();
std::uint64_t const blockIdxx = handler.blockIdx();
auto const shared = (T*)handler.sharedMemory();
if (icell < ntotcells) {
auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
handler.sharedBarrier();
for (unsigned int m = threadIdxx,
mend = amrex::min<unsigned int>(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE;
for (std::uint64_t m = threadIdxx,
mend = amrex::min<std::uint64_t>(blockDimx, ntotcells-blockDimx*blockIdxx) * STRUCTSIZE;
m < mend; m += blockDimx) {
p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
}
Expand All @@ -75,24 +69,19 @@ void fill (BaseFab<STRUCT>& aos_fab, F && f)
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept
{
int icell = blockDim.x*blockIdx.x+threadIdx.x;
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
Gpu::SharedMemory<T> gsm;
T* const shared = gsm.dataPtr();
if (icell < ntotcells) {
auto ga = new(shared+threadIdx.x*STRUCTSIZE) STRUCT;
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
int i = (icell - k*lenxy) - j*lenx;
i += lo.x;
j += lo.y;
k += lo.z;
auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
auto [i, j, k] = indexer(icell);
f(*ga, i, j, k);
}
__syncthreads();
for (unsigned int m = threadIdx.x,
mend = amrex::min<unsigned int>(blockDim.x, ntotcells-blockDim.x*blockIdx.x) * STRUCTSIZE;
for (std::uint64_t m = threadIdx.x,
mend = amrex::min<std::uint64_t>(blockDim.x, ntotcells-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
m < mend; m += blockDim.x) {
p[blockDim.x*blockIdx.x*STRUCTSIZE+m] = shared[m];
p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
}
});
#endif
Expand Down
81 changes: 81 additions & 0 deletions Src/Base/AMReX_Box.H
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <AMReX_Vector.H>
#include <AMReX_GpuQualifiers.H>
#include <AMReX_GpuControl.H>
#include <AMReX_Math.H>

#include <iosfwd>

Expand Down Expand Up @@ -1835,6 +1836,86 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy
return Box(IntVect(AMREX_D_DECL(i,j,k)),IntVect(AMREX_D_DECL(i,j,k)),typ);
}

struct BoxIndexer
{
#if (AMREX_SPACEDIM == 3)
Math::FastDivmodU64 fdxy;
Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
std::uint64_t x, y, z, rem;
fdxy(z, rem, icell);
fdx(y, x, rem);
return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
std::uint64_t x, y, z, rem;
fdxy(z, rem, icell);
fdx(y, x, rem);
return {int(x)+lo[0], int(y)+lo[1], int(z)+lo[2]};
}

#elif (AMREX_SPACEDIM == 2)

Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
std::uint64_t x, y;
fdx(y, x, icell);
return {int(x)+lo[0], int(y)+lo[1], 0};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
std::uint64_t x, y;
fdx(y, x, icell);
return {int(x)+lo[0], int(y)+lo[1]};
}

#elif (AMREX_SPACEDIM == 1)

int lo;

BoxIndexer (Box const& box)
: lo(box.smallEnd(0))
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Dim3 operator() (std::uint64_t icell) const
{
return {int(icell)+lo, 0, 0};
}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
IntVect intVect (std::uint64_t icell) const
{
return IntVect{int(icell)+lo};
}

#endif
};

}

#endif /*AMREX_BOX_H*/
1 change: 1 addition & 0 deletions Src/Base/AMReX_GpuLaunch.H
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <AMReX_GpuLaunchGlobal.H>
#include <AMReX_RandomEngine.H>
#include <AMReX_Algorithm.H>
#include <AMReX_Math.H>
#include <cstddef>
#include <limits>
#include <algorithm>
Expand Down
Loading
Loading