Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_HOST #3591

Merged
merged 31 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
fbc67d5
use AMREX_IF_ON_DEVICE
BenWibking Oct 10, 2023
02ee118
fix macro usage
BenWibking Oct 14, 2023
b281824
fix typos
BenWibking Oct 14, 2023
209c960
use double parentheses for macro args
BenWibking Oct 14, 2023
a322241
workaround pragma omp atomic restrictions
BenWibking Oct 14, 2023
ce5c94d
convert CLZ functions to use macros
BenWibking Oct 14, 2023
7c64ca8
update GPU doc
BenWibking Oct 14, 2023
470c3cd
handle all cases in CLZ
BenWibking Oct 14, 2023
615bffc
avoid static local device var
BenWibking Oct 15, 2023
5471bc7
fix __CUDA_ARCH__ ifdefs
BenWibking Oct 15, 2023
aa02932
fix trailing whitespace
BenWibking Oct 15, 2023
f3bf52f
Merge branch 'development' into replace-cuda-arch
BenWibking Oct 20, 2023
6466ad2
Update Src/Base/AMReX_GpuQualifiers.H
BenWibking Oct 21, 2023
10ea8bb
fix CLZ on host
BenWibking Oct 22, 2023
e448b35
simpler pattern for SYCL_DEVICE/other device/host
BenWibking Oct 22, 2023
009f66a
fix GpuRange SYCL_DEVICE/other device/host pattern
BenWibking Oct 22, 2023
3668c36
fix Random SYCL_DEVICE/other device/host
BenWibking Oct 22, 2023
a89692e
avoid __CUDA_ARCH__ in Math
BenWibking Oct 22, 2023
36ecb44
restore AMREX_EXPORT
BenWibking Oct 22, 2023
b98d4b3
remove trailing whitespace
BenWibking Oct 22, 2023
9b7cd2c
Merge branch 'development' into replace-cuda-arch
BenWibking Oct 24, 2023
7c2c9b6
fix remaining uses of __CUDA_ARCH__
BenWibking Oct 24, 2023
34f9cbc
Simplify clz
WeiqunZhang Nov 2, 2023
98f29ed
Need an extra declaration
WeiqunZhang Nov 2, 2023
0ec2ff1
Fix default parameter
WeiqunZhang Nov 2, 2023
1f1850f
Update Src/Base/AMReX_GpuAtomic.H
WeiqunZhang Nov 6, 2023
891f167
Apply suggestions from code review
WeiqunZhang Nov 6, 2023
8f2ef81
Update Src/Base/AMReX_RealVect.H
WeiqunZhang Nov 7, 2023
f0ae8bd
Revert back to use GNU extension if available
WeiqunZhang Nov 7, 2023
a43b5bb
Merge branch 'replace-cuda-arch' of github.com:BenWibking/amrex into …
WeiqunZhang Nov 7, 2023
2946aef
Add back static, which somehow got lost
WeiqunZhang Nov 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions Docs/sphinx_documentation/source/GPU.rst
Original file line number Diff line number Diff line change
Expand Up @@ -489,11 +489,10 @@ GPU support.
When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``,
``AMREX_USE_OMP_OFFLOAD`` is defined.

In addition to AMReX's preprocessor macros, CUDA provides the
``__CUDA_ARCH__`` macro which is only defined when in device code.
HIP and SYCL provide similar macros.
``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__``
function requires separate code for the CPU and GPU implementations.
The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and
``AMREX_IF_ON_HOST((code_for_host))`` should be used when a
``__host__ __device__`` function requires separate code for the
CPU and GPU implementations.

.. ===================================================================

Expand Down
60 changes: 27 additions & 33 deletions Src/Base/AMReX.H
Original file line number Diff line number Diff line change
Expand Up @@ -113,16 +113,15 @@ namespace amrex

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void Error (const char* msg = nullptr) {
#if AMREX_DEVICE_COMPILE
#if defined(NDEBUG)
amrex::ignore_unused(msg);
AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);))
#else
if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); }
AMREX_DEVICE_ASSERT(0);
#endif
#else
Error_host("Error", msg);
AMREX_IF_ON_DEVICE((
if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); }
AMREX_DEVICE_ASSERT(0);
))
#endif
AMREX_IF_ON_HOST((Error_host("Error", msg);))
}

//! Print out warning message to cerr.
Expand All @@ -132,32 +131,28 @@ namespace amrex

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void Warning (const char * msg) {
#if AMREX_DEVICE_COMPILE
#if defined(NDEBUG)
amrex::ignore_unused(msg);
#else
if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); }
#endif
AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);))
#else
Warning_host(msg);
AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); }))
#endif
AMREX_IF_ON_HOST((Warning_host(msg);))
}

//! Print out message to cerr and exit via abort().
void Abort (const std::string& msg);

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void Abort (const char * msg = nullptr) {
#if AMREX_DEVICE_COMPILE
#if defined(NDEBUG)
amrex::ignore_unused(msg);
AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);))
#else
if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); }
AMREX_DEVICE_ASSERT(0);
#endif
#else
Error_host("Abort", msg);
AMREX_IF_ON_DEVICE((
if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); }
AMREX_DEVICE_ASSERT(0);
))
#endif
AMREX_IF_ON_HOST((Error_host("Abort", msg);))
}

/**
Expand All @@ -170,22 +165,21 @@ namespace amrex

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) {
#if AMREX_DEVICE_COMPILE
#if defined(NDEBUG)
amrex::ignore_unused(EX,file,line,msg);
#else
if (msg) {
AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s",
EX, file, line, msg);
} else {
AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d",
EX, file, line);
}
AMREX_DEVICE_ASSERT(0);
#endif
AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);))
#else
Assert_host(EX,file,line,msg);
AMREX_IF_ON_DEVICE((
if (msg) {
AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s",
EX, file, line, msg);
} else {
AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d",
EX, file, line);
}
AMREX_DEVICE_ASSERT(0);
))
#endif
AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);))
}

/**
Expand Down
180 changes: 99 additions & 81 deletions Src/Base/AMReX_Algorithm.H
Original file line number Diff line number Diff line change
Expand Up @@ -161,51 +161,52 @@ namespace amrex
AMREX_GPU_HOST_DEVICE
ItType upper_bound (ItType first, ItType last, const ValType& val)
{
#if AMREX_DEVICE_COMPILE
std::ptrdiff_t count = last-first;
while(count>0){
auto it = first;
const auto step = count/2;
it += step;
if (!(val < *it)){
first = ++it;
count -= step + 1;
AMREX_IF_ON_DEVICE((
std::ptrdiff_t count = last-first;
while(count>0){
auto it = first;
const auto step = count/2;
it += step;
if (!(val < *it)){
first = ++it;
count -= step + 1;
}
else{
count = step;
}
}
else{
count = step;
}
}

return first;
#else
return std::upper_bound(first, last, val);
#endif
return first;
))
AMREX_IF_ON_HOST((
return std::upper_bound(first, last, val);
))
}

template<typename ItType, typename ValType>
AMREX_GPU_HOST_DEVICE
ItType lower_bound (ItType first, ItType last, const ValType& val)
{
#ifdef AMREX_DEVICE_COMPILE
std::ptrdiff_t count = last-first;
while(count>0)
{
auto it = first;
const auto step = count/2;
it += step;
if (*it < val){
first = ++it;
count -= step + 1;
}
else{
count = step;
AMREX_IF_ON_DEVICE((
std::ptrdiff_t count = last-first;
while(count>0)
{
auto it = first;
const auto step = count/2;
it += step;
if (*it < val){
first = ++it;
count -= step + 1;
}
else{
count = step;
}
}
}

return first;
#else
return std::lower_bound(first, last, val);
#endif
return first;
))
AMREX_IF_ON_HOST((
return std::lower_bound(first, last, val);
))
}

namespace detail {
Expand Down Expand Up @@ -239,83 +240,100 @@ int builtin_clz_wrapper (clzll_tag, T x) noexcept
return static_cast<int>(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT));
}

#ifdef AMREX_USE_CUDA

// likewise with CUDA, there are __clz functions that take (signed) int and long long int
template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(int)>::type>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int clz_wrapper (clz_tag, T x) noexcept
{
return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
}

template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(long long int)>::type>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int clz_wrapper (clzll_tag, T x) noexcept
{
return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
}
#endif

}
// Forward declaration of clz (count leading zeros), constrained to the
// fixed-width unsigned types std::uint8_t/16_t/32_t/64_t (after decay).
// It is declared ahead of the clz_generic overloads because those call clz
// on the narrower halves of their argument. The default value (= 0) for the
// enable_if template parameter is given here; the definitions further down
// repeat the parameter list without a default.
template <class T, typename std::enable_if_t<std::is_same_v<std::decay_t<T>,std::uint8_t> ||
std::is_same_v<std::decay_t<T>,std::uint16_t> ||
std::is_same_v<std::decay_t<T>,std::uint32_t> ||
std::is_same_v<std::decay_t<T>,std::uint64_t>, int> = 0>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (T x) noexcept;

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (std::uint8_t x) noexcept
int clz_generic (std::uint8_t x) noexcept
{
#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
return detail::clz_wrapper(detail::clz_tag{}, x);
#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
#else
static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
auto upper = x >> 4;
auto lower = x & 0xF;
return upper ? clz_lookup[upper] : 4 + clz_lookup[lower];
#endif
}

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (std::uint16_t x) noexcept
int clz_generic (std::uint16_t x) noexcept
{
#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
return detail::clz_wrapper(detail::clz_tag{}, x);
#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
#else
auto upper = std::uint8_t(x >> 8);
auto lower = std::uint8_t(x & 0xFF);
return upper ? clz(upper) : 8 + clz(lower);
#endif
}

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (std::uint32_t x) noexcept
int clz_generic (std::uint32_t x) noexcept
{
#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
return detail::clz_wrapper(detail::clz_tag{}, x);
#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
#else
auto upper = std::uint16_t(x >> 16);
auto lower = std::uint16_t(x & 0xFFFF);
return upper ? clz(upper) : 16 + clz(lower);
#endif
}

AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (std::uint64_t x) noexcept
int clz_generic (std::uint64_t x) noexcept
{
#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz
return detail::clz_wrapper(detail::clz_tag{}, x);
#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
#else
auto upper = std::uint32_t(x >> 32);
auto lower = std::uint32_t(x & 0xFFFFFFFF);
return upper ? clz(upper) : 32 + clz(lower);
}

#if defined AMREX_USE_CUDA

namespace detail {
// likewise with CUDA, there are __clz functions that take (signed) int and long long int
//
// Device-only wrapper around __clz, selected by passing a clz_tag.
// Participates in overload resolution only when sizeof(T) <= sizeof(int).
// NOTE(review): clz_tag / clzll_tag are declared outside this view; callers
// pass clz_tag{} for every T, so presumably clz_tag converts to clzll_tag
// when this overload is disabled -- confirm against the tag definitions.
template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(int)>::type>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int clz_wrapper (clz_tag, T x) noexcept
{
// x is converted to int before counting, so the intrinsic's result is
// relative to the width of int; subtracting the width difference yields
// the leading-zero count relative to the width of T.
return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
}

// Device-only wrapper around __clzll, selected by a clzll_tag argument.
// Participates in overload resolution only when
// sizeof(T) <= sizeof(long long int).
template <typename T, typename = typename std::enable_if<sizeof(T) <= sizeof(long long int)>::type>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
int clz_wrapper (clzll_tag, T x) noexcept
{
// Same width correction as above, relative to long long int.
return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT);
}
}

// Definition of clz used when AMREX_USE_CUDA is defined.
// Returns the value produced by whichever helper is selected below:
//   - device code: detail::clz_wrapper (built on the CUDA __clz intrinsics);
//   - host code:   detail::builtin_clz_wrapper when AMREX_HAS_BUILTIN_CLZ is
//                  nonzero, otherwise the portable clz_generic overloads.
// Accepts only std::uint8_t/16_t/32_t/64_t (after decay).
template <class T, typename std::enable_if_t<std::is_same_v<std::decay_t<T>,std::uint8_t> ||
std::is_same_v<std::decay_t<T>,std::uint16_t> ||
std::is_same_v<std::decay_t<T>,std::uint32_t> ||
std::is_same_v<std::decay_t<T>,std::uint64_t>, int> >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (T x) noexcept
{
// The code is passed in double parentheses so it reaches the macro as a
// single argument.
AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);))
// The #if selects between two complete macro invocations rather than
// sitting inside a macro argument, where preprocessor directives are not
// permitted.
#if AMREX_HAS_BUILTIN_CLZ
AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);))
#else
AMREX_IF_ON_HOST((return clz_generic(x);))
#endif
}

#else // !defined AMREX_USE_CUDA

// Definition of clz used when AMREX_USE_CUDA is not defined.
// Uses detail::builtin_clz_wrapper when this is not a device compilation
// pass and AMREX_HAS_BUILTIN_CLZ is nonzero; in every other case it falls
// back to the portable clz_generic overloads.
// Accepts only std::uint8_t/16_t/32_t/64_t (after decay).
template <class T, typename std::enable_if_t<std::is_same_v<std::decay_t<T>,std::uint8_t> ||
std::is_same_v<std::decay_t<T>,std::uint16_t> ||
std::is_same_v<std::decay_t<T>,std::uint32_t> ||
std::is_same_v<std::decay_t<T>,std::uint64_t>, int> >
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
int clz (T x) noexcept
{
// NOTE(review): this branch still tests AMREX_DEVICE_COMPILE directly
// instead of using AMREX_IF_ON_HOST / AMREX_IF_ON_DEVICE as the CUDA
// definition does -- confirm this is intentional for non-CUDA backends.
#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ)
return detail::builtin_clz_wrapper(detail::clz_tag{}, x);
#else
return clz_generic(x);
#endif
}

#endif // defined AMREX_USE_CUDA

}

#endif
Loading
Loading