Commit 99bcf21

Update utils

chillenzer committed Nov 8, 2024
1 parent a6d7a9c commit 99bcf21
Showing 1 changed file with 82 additions and 161 deletions.

243 changes: 82 additions & 161 deletions src/include/mallocMC/mallocMC_utils.hpp
@@ -5,12 +5,13 @@
Copyright (C) 2012 Institute for Computer Graphics and Vision,
Graz University of Technology
Copyright (C) 2014 Institute of Radiation Physics,
Copyright (C) 2014-2024 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
Michael Kenzel - kenzel ( at ) icg.tugraz.at
Carlchristian Eckert - c.eckert ( at ) hzdr.de
Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -34,16 +35,14 @@
#pragma once

#include <alpaka/alpaka.hpp>
#include <alpaka/core/Common.hpp>
#include <sys/types.h>

#ifdef _MSC_VER
# include <intrin.h>
#endif

#include <atomic>
#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>

/* HIP-clang is doing something wrong and uses the host path of the code when __HIP_DEVICE_COMPILE__
@@ -56,38 +55,25 @@

namespace mallocMC
{
template<int PSIZE>
class __PointerEquivalent
{
public:
using type = unsigned int;
};
template<>
class __PointerEquivalent<8>
{
public:
using type = unsigned long long;
};

#if defined(__CUDA_ARCH__)
constexpr auto warpSize = 32; // TODO
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// defined:
// https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/clang/lib/Basic/Targets/AMDGPU.cpp#L400
// overview wave front size:
// https://github.com/llvm/llvm-project/blob/efc063b621ea0c4d1e452bcade62f7fc7e1cc937/clang/test/Driver/amdgpu-macros.cl#L70-L115
// gfx10XX has 32 threads per wavefront else 64
template<typename TAcc>
constexpr uint32_t warpSize = 1U;

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
constexpr uint32_t warpSize<alpaka::AccGpuCudaRt<TDim, TIdx>> = 32U;
#endif

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
# if(HIP_VERSION_MAJOR >= 4)
constexpr auto warpSize = __AMDGCN_WAVEFRONT_SIZE;
template<typename TDim, typename TIdx>
constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = __AMDGCN_WAVEFRONT_SIZE;
# else
constexpr auto warpSize = 64;
template<typename TDim, typename TIdx>
constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = 64;
# endif
#else
constexpr auto warpSize = 1;
#endif
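/* Usage sketch (illustrative, not part of this commit; `isFullWarp` is a
 * hypothetical helper): the variable template above is resolved at compile
 * time from the accelerator type, so generic kernel code can query the warp
 * width without preprocessor branches:
 * @code
 * template<typename TAcc>
 * ALPAKA_FN_ACC auto isFullWarp(uint32_t activeLanes) -> bool
 * {
 *     return activeLanes == warpSize<TAcc>; // 32U on CUDA, 1U on serial CPU back-ends
 * }
 * @endcode
 */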

using PointerEquivalent = mallocMC::__PointerEquivalent<sizeof(char*)>::type;

ALPAKA_FN_ACC inline auto laneid()
{
#if defined(__CUDA_ARCH__)
@@ -97,7 +83,7 @@ namespace mallocMC
#elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__)
return __lane_id();
#else
return 0u;
return 0U;
#endif
}

@@ -108,83 +94,87 @@
*
* @return current index of the warp
*/
ALPAKA_FN_ACC inline auto warpid()
template<typename TAcc>
ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t
{
#if defined(__CUDA_ARCH__)
std::uint32_t mywarpid;
return 0U;
}

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
// ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is
// specific to the device and cannot be compiled on the host, so we need an explicit `__device__` here.
__device__ inline auto warpid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
std::uint32_t mywarpid = 0;
asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid));
return mywarpid;
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
}
#endif

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
// get wave id
// https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024
return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4));
#else
return 0u;
}
#endif
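/* Usage sketch (illustrative, not part of this commit; `kernelBody` is a
 * hypothetical helper): overload resolution on the accelerator argument picks
 * the device-specific implementation, while the generic template above serves
 * as the CPU fallback:
 * @code
 * template<typename TAcc>
 * ALPAKA_FN_ACC void kernelBody(TAcc const& acc)
 * {
 *     auto const myWarp = warpid(acc); // %warpid on CUDA, wave id on HIP, 0U otherwise
 * }
 * @endcode
 */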

template<typename TAcc>
ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t
{
return 0U;
}

ALPAKA_FN_ACC inline auto smid()
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
#if defined(__CUDA_ARCH__)
std::uint32_t mysmid;
std::uint32_t mysmid = 0;
asm("mov.u32 %0, %%smid;" : "=r"(mysmid));
return mysmid;
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
return __smid();
#else
return 0u;
}
#endif

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
return __smid();
}
#endif

ALPAKA_FN_ACC inline auto lanemask_lt()
template<typename TAcc>
ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/)
{
return 0U;
}
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/)
{
#if defined(__CUDA_ARCH__)
std::uint32_t lanemask;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask));
return lanemask;
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
return __lanemask_lt();
#else
return 0u;
#endif
}

ALPAKA_FN_ACC inline auto ballot(int pred)
{
#if defined(__CUDA_ARCH__)
return __ballot_sync(__activemask(), pred);
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return __ballot(pred);
#else
return 1u;
#endif
}


ALPAKA_FN_ACC inline auto activemask()
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/)
{
#if defined(__CUDA_ARCH__)
return __activemask();
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return ballot(1);
#else
return 1u;
#endif
return __lanemask_lt();
}
#endif
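/* Illustrative note (not part of this commit): lanemask_lt returns a bit mask
 * with one bit set for every lane id lower than the calling lane's id.
 * Intersected with the warp's active mask and popcounted, it yields the
 * calling thread's rank among the active lanes of its warp.
 */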

template<class T>
ALPAKA_FN_HOST_ACC inline auto divup(T a, T b) -> T
{
return (a + b - 1) / b;
}

/** the maximal number of threads per block, valid for sm_2.X - sm_7.5
*
* https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
*/
constexpr uint32_t maxThreadsPerBlock = 1024;
constexpr uint32_t maxThreadsPerBlock = 1024U;

/** warp id within a cuda block
*
@@ -199,96 +189,27 @@ namespace mallocMC
const auto localId = alpaka::mapIdx<1>(
alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc),
alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0];
return localId / warpSize;
}

template<typename T>
ALPAKA_FN_ACC inline auto ffs(T mask) -> std::uint32_t
{
#if defined(__CUDA_ARCH__)
return ::__ffs(mask);
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return ::__ffsll(static_cast<unsigned long long int>(mask));
#else
if(mask == 0)
return 0;
auto i = 1u;
while((mask & 1) == 0)
{
mask >>= 1;
i++;
}
return i;
#endif
return localId / warpSize<AlpakaAcc>;
}
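/* Illustrative note (not part of this commit): with 128 threads per block on
 * a CUDA accelerator (warpSize == 32U), threads 0-31 map to warpIdx == 0,
 * threads 32-63 to warpIdx == 1, and so on; on serial CPU back-ends
 * (warpSize == 1U) every thread forms its own warp.
 */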

template<typename T>
ALPAKA_FN_ACC inline auto popc(T mask) -> std::uint32_t
template<typename T, typename U, typename = std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T
{
#if defined(__CUDA_ARCH__)
return ::__popc(mask);
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return ::__popcll(static_cast<unsigned long long int>(mask));
#else
// cf.
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
std::uint32_t count = 0;
while(mask)
{
count++;
mask &= mask - 1;
}
return count;
#endif
return (numerator + (denominator - 1)) / denominator;
}
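/* Usage sketch (illustrative, not part of this commit): the function is
 * constexpr, so typical work-division arithmetic can be verified at compile
 * time, e.g. distributing 1000 items over chunks of 256:
 * @code
 * static_assert(ceilingDivision(1000U, 256U) == 4U);
 * static_assert(ceilingDivision(1024U, 256U) == 4U);
 * @endcode
 */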

// Threadfence implementations will maybe moved later into alpaka
template<typename T_Acc, typename T_Sfinae = void>
struct ThreadFence
{
// CPU only implementation
static void device()
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}

static void block()
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}
};

template<typename... T_AccArgs>
struct ThreadFence<alpaka::AccGpuUniformCudaHipRt<T_AccArgs...>, void>
{
static ALPAKA_FN_ACC void device()
{
#if MALLOCMC_DEVICE_COMPILE
__threadfence();
#endif
}

static ALPAKA_FN_ACC void block()
{
#if MALLOCMC_DEVICE_COMPILE
__threadfence_block();
#endif
}
};

ALPAKA_NO_HOST_ACC_WARNING
template<typename T_Acc>
ALPAKA_FN_ACC void threadfenceDevice(T_Acc const& acc)
template<typename T_size>
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf(
void const* const pointer,
void const* const start,
T_size const stepSize) -> std::make_signed_t<T_size>
{
ThreadFence<T_Acc>::device();
return std::distance(reinterpret_cast<char const*>(start), reinterpret_cast<char const*>(pointer)) / stepSize;
}
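/* Usage sketch (illustrative, not part of this commit; `pool` is a
 * hypothetical buffer): indexOf recovers the slot index of a pointer inside a
 * contiguous pool of fixed-size chunks; the signed return type makes a
 * pointer below `start` yield a negative index:
 * @code
 * char pool[64];
 * auto const slot = indexOf(pool + 32, pool, 16U); // == 2
 * @endcode
 */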

ALPAKA_NO_HOST_ACC_WARNING
template<typename T_Acc>
ALPAKA_FN_ACC void threadfenceBlock(T_Acc const& acc)
template<typename TAcc, typename T>
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target)
{
ThreadFence<T_Acc>::block();
return alpaka::atomicCas(acc, &target, static_cast<T>(0U), static_cast<T>(0U));
}
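/* Usage sketch (illustrative, not part of this commit; `spinUntilNonZero` is
 * a hypothetical helper): the compare-and-swap with 0 as both the expected and
 * the desired value never modifies the target, but its return value
 * constitutes an atomic read of it:
 * @code
 * template<typename TAcc>
 * ALPAKA_FN_ACC void spinUntilNonZero(TAcc const& acc, uint32_t& flag)
 * {
 *     while(atomicLoad(acc, flag) == 0U)
 *     {
 *     }
 * }
 * @endcode
 */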
} // namespace mallocMC
