Commit 99bcf21

Update utils

chillenzer committed Nov 8, 2024
1 parent a6d7a9c commit 99bcf21
Showing 1 changed file with 82 additions and 161 deletions.

243 changes: 82 additions & 161 deletions src/include/mallocMC/mallocMC_utils.hpp
@@ -5,12 +5,13 @@
Copyright (C) 2012 Institute for Computer Graphics and Vision,
Graz University of Technology
Copyright (C) 2014 Institute of Radiation Physics,
Copyright (C) 2014-2024 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Markus Steinberger - steinberger ( at ) icg.tugraz.at
Michael Kenzel - kenzel ( at ) icg.tugraz.at
Carlchristian Eckert - c.eckert ( at ) hzdr.de
Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -34,16 +35,14 @@
#pragma once

#include <alpaka/alpaka.hpp>
#include <alpaka/core/Common.hpp>
#include <sys/types.h>

#ifdef _MSC_VER
# include <intrin.h>
#endif

#include <atomic>
#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>

/* HIP-clang is doing something wrong and uses the host path of the code when __HIP_DEVICE_COMPILE__
@@ -56,38 +55,25 @@

namespace mallocMC
{
template<int PSIZE>
class __PointerEquivalent
{
public:
using type = unsigned int;
};
template<>
class __PointerEquivalent<8>
{
public:
using type = unsigned long long;
};

#if defined(__CUDA_ARCH__)
constexpr auto warpSize = 32; // TODO
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// defined:
// https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/clang/lib/Basic/Targets/AMDGPU.cpp#L400
// overview wave front size:
// https://github.com/llvm/llvm-project/blob/efc063b621ea0c4d1e452bcade62f7fc7e1cc937/clang/test/Driver/amdgpu-macros.cl#L70-L115
// gfx10XX has 32 threads per wavefront else 64
template<typename TAcc>
constexpr uint32_t warpSize = 1U;

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
constexpr uint32_t warpSize<alpaka::AccGpuCudaRt<TDim, TIdx>> = 32U;
#endif

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
# if(HIP_VERSION_MAJOR >= 4)
constexpr auto warpSize = __AMDGCN_WAVEFRONT_SIZE;
template<typename TDim, typename TIdx>
constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = __AMDGCN_WAVEFRONT_SIZE;
# else
constexpr auto warpSize = 64;
template<typename TDim, typename TIdx>
constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = 64;
# endif
#else
constexpr auto warpSize = 1;
#endif
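/* Usage sketch (illustrative, not part of this commit; `isFullWarp` is a
 * hypothetical helper): the variable template above is resolved at compile
 * time from the accelerator type, so generic kernel code can query the warp
 * width without preprocessor branches:
 * @code
 * template<typename TAcc>
 * ALPAKA_FN_ACC auto isFullWarp(uint32_t activeLanes) -> bool
 * {
 *     return activeLanes == warpSize<TAcc>; // 32U on CUDA, 1U on serial CPU back-ends
 * }
 * @endcode
 */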

using PointerEquivalent = mallocMC::__PointerEquivalent<sizeof(char*)>::type;

ALPAKA_FN_ACC inline auto laneid()
{
#if defined(__CUDA_ARCH__)
@@ -97,7 +83,7 @@ namespace mallocMC
#elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__)
return __lane_id();
#else
return 0u;
return 0U;
#endif
}

@@ -108,83 +94,87 @@
*
* @return current index of the warp
*/
ALPAKA_FN_ACC inline auto warpid()
template<typename TAcc>
ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t
{
#if defined(__CUDA_ARCH__)
std::uint32_t mywarpid;
return 0U;
}

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
// ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is
// specific to the device and cannot be compiled on the host, so we need an explicit `__device__` here.
__device__ inline auto warpid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
std::uint32_t mywarpid = 0;
asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid));
return mywarpid;
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
}
#endif

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
// get wave id
// https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024
return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4));
#else
return 0u;
}
#endif
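/* Usage sketch (illustrative, not part of this commit; `kernelBody` is a
 * hypothetical helper): overload resolution on the accelerator argument picks
 * the device-specific implementation, while the generic template above serves
 * as the CPU fallback:
 * @code
 * template<typename TAcc>
 * ALPAKA_FN_ACC void kernelBody(TAcc const& acc)
 * {
 *     auto const myWarp = warpid(acc); // %warpid on CUDA, wave id on HIP, 0U otherwise
 * }
 * @endcode
 */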

template<typename TAcc>
ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t
{
return 0U;
}

ALPAKA_FN_ACC inline auto smid()
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
#if defined(__CUDA_ARCH__)
std::uint32_t mysmid;
std::uint32_t mysmid = 0;
asm("mov.u32 %0, %%smid;" : "=r"(mysmid));
return mysmid;
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
return __smid();
#else
return 0u;
}
#endif

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
{
return __smid();
}
#endif

ALPAKA_FN_ACC inline auto lanemask_lt()
template<typename TAcc>
ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/)
{
return 0U;
}
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/)
{
#if defined(__CUDA_ARCH__)
std::uint32_t lanemask;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask));
return lanemask;
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
return __lanemask_lt();
#else
return 0u;
#endif
}

ALPAKA_FN_ACC inline auto ballot(int pred)
{
#if defined(__CUDA_ARCH__)
return __ballot_sync(__activemask(), pred);
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return __ballot(pred);
#else
return 1u;
#endif
}


ALPAKA_FN_ACC inline auto activemask()
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
template<typename TDim, typename TIdx>
ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/)
{
#if defined(__CUDA_ARCH__)
return __activemask();
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return ballot(1);
#else
return 1u;
#endif
return __lanemask_lt();
}
#endif
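/* Illustrative note (not part of this commit): lanemask_lt returns a bit mask
 * with one bit set for every lane id lower than the calling lane's id.
 * Intersected with the warp's active mask and popcounted, it yields the
 * calling thread's rank among the active lanes of its warp.
 */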

template<class T>
ALPAKA_FN_HOST_ACC inline auto divup(T a, T b) -> T
{
return (a + b - 1) / b;
}

/** the maximal number of threads per block, valid for sm_2.X - sm_7.5
*
* https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
*/
constexpr uint32_t maxThreadsPerBlock = 1024;
constexpr uint32_t maxThreadsPerBlock = 1024U;

/** warp id within a cuda block
*
@@ -199,96 +189,27 @@ namespace mallocMC
const auto localId = alpaka::mapIdx<1>(
alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc),
alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0];
return localId / warpSize;
}

template<typename T>
ALPAKA_FN_ACC inline auto ffs(T mask) -> std::uint32_t
{
#if defined(__CUDA_ARCH__)
return ::__ffs(mask);
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return ::__ffsll(static_cast<unsigned long long int>(mask));
#else
if(mask == 0)
return 0;
auto i = 1u;
while((mask & 1) == 0)
{
mask >>= 1;
i++;
}
return i;
#endif
return localId / warpSize<AlpakaAcc>;
}
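/* Illustrative note (not part of this commit): with 128 threads per block on
 * a CUDA accelerator (warpSize == 32U), threads 0-31 map to warpIdx == 0,
 * threads 32-63 to warpIdx == 1, and so on; on serial CPU back-ends
 * (warpSize == 1U) every thread forms its own warp.
 */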

template<typename T>
ALPAKA_FN_ACC inline auto popc(T mask) -> std::uint32_t
template<typename T, typename U, typename = std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T
{
#if defined(__CUDA_ARCH__)
return ::__popc(mask);
#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
// return value is 64bit for HIP-clang
return ::__popcll(static_cast<unsigned long long int>(mask));
#else
// cf.
// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
std::uint32_t count = 0;
while(mask)
{
count++;
mask &= mask - 1;
}
return count;
#endif
return (numerator + (denominator - 1)) / denominator;
}
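/* Usage sketch (illustrative, not part of this commit): the function is
 * constexpr, so typical work-division arithmetic can be verified at compile
 * time, e.g. distributing 1000 items over chunks of 256:
 * @code
 * static_assert(ceilingDivision(1000U, 256U) == 4U);
 * static_assert(ceilingDivision(1024U, 256U) == 4U);
 * @endcode
 */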

// Threadfence implementations will maybe moved later into alpaka
template<typename T_Acc, typename T_Sfinae = void>
struct ThreadFence
{
// CPU only implementation
static void device()
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}

static void block()
{
std::atomic_thread_fence(std::memory_order_seq_cst);
}
};

template<typename... T_AccArgs>
struct ThreadFence<alpaka::AccGpuUniformCudaHipRt<T_AccArgs...>, void>
{
static ALPAKA_FN_ACC void device()
{
#if MALLOCMC_DEVICE_COMPILE
__threadfence();
#endif
}

static ALPAKA_FN_ACC void block()
{
#if MALLOCMC_DEVICE_COMPILE
__threadfence_block();
#endif
}
};

ALPAKA_NO_HOST_ACC_WARNING
template<typename T_Acc>
ALPAKA_FN_ACC void threadfenceDevice(T_Acc const& acc)
template<typename T_size>
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf(
void const* const pointer,
void const* const start,
T_size const stepSize) -> std::make_signed_t<T_size>
{
ThreadFence<T_Acc>::device();
return std::distance(reinterpret_cast<char const*>(start), reinterpret_cast<char const*>(pointer)) / stepSize;
}
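/* Usage sketch (illustrative, not part of this commit; `pool` is a
 * hypothetical buffer): indexOf recovers the slot index of a pointer inside a
 * contiguous pool of fixed-size chunks; the signed return type makes a
 * pointer below `start` yield a negative index:
 * @code
 * char pool[64];
 * auto const slot = indexOf(pool + 32, pool, 16U); // == 2
 * @endcode
 */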

ALPAKA_NO_HOST_ACC_WARNING
template<typename T_Acc>
ALPAKA_FN_ACC void threadfenceBlock(T_Acc const& acc)
template<typename TAcc, typename T>
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target)
{
ThreadFence<T_Acc>::block();
return alpaka::atomicCas(acc, &target, static_cast<T>(0U), static_cast<T>(0U));
}
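/* Usage sketch (illustrative, not part of this commit; `spinUntilNonZero` is
 * a hypothetical helper): the compare-and-swap with 0 as both the expected and
 * the desired value never modifies the target, but its return value
 * constitutes an atomic read of it:
 * @code
 * template<typename TAcc>
 * ALPAKA_FN_ACC void spinUntilNonZero(TAcc const& acc, uint32_t& flag)
 * {
 *     while(atomicLoad(acc, flag) == 0U)
 *     {
 *     }
 * }
 * @endcode
 */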
} // namespace mallocMC
