Add infrastructure and example for native CUDA
chillenzer committed Feb 7, 2025
1 parent a288377 commit d65aa6a
Showing 7 changed files with 334 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -42,6 +42,8 @@ target_include_directories(
 ${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
 $<INSTALL_INTERFACE:include/${PROJECT_NAME}-${PROJECT_VERSION}>
 )
+target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka)
+
 
 option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
 if(mallocMC_BUILD_TESTING)
7 changes: 4 additions & 3 deletions cmake/package-lock.cmake
@@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake
 # alpaka
 CPMDeclarePackage(alpaka
 NAME alpaka
-GIT_TAG 1.2.0
-GITHUB_REPOSITORY alpaka-group/alpaka
+# temporary solution until this is merged into alpaka
+GIT_TAG add-option-for-installation
+GITHUB_REPOSITORY chillenzer/alpaka
 OPTIONS
-"alpaka_CXX_STANDARD 20"
+"alpaka_CXX_STANDARD 20;alpaka_INSTALL ON"
 # It is recommended to let CPM cache dependencies in order to reduce redundant downloads.
 # However, we might in the foreseeable future turn to unstable references like the `dev` branch here.
 # Setting the following option tells CPM to not use the cache.
7 changes: 6 additions & 1 deletion examples/CMakeLists.txt
@@ -12,8 +12,13 @@ add_subdirectory(
 ${CMAKE_BINARY_DIR}/examples/getAvailableSlots
 )
 
+add_subdirectory(
+${CMAKE_CURRENT_LIST_DIR}/native-cuda
+${CMAKE_BINARY_DIR}/examples/native-cuda
+)
+
 add_custom_target(
 mallocMCExamples
-DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots
+DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda
 COMMENT "Shortcut for building all examples."
 )
33 changes: 33 additions & 0 deletions examples/native-cuda/CMakeLists.txt
@@ -0,0 +1,33 @@
cmake_minimum_required(VERSION 3.14...3.22)

include(CheckLanguage)
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA)

# --- Import tools ----

include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)

# ---- Dependencies ----

include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)

if(NOT TARGET mallocMC)
CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
endif()

# ---- Create standalone executable ----

add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu)

set_target_properties(${PROJECT_NAME}
PROPERTIES
CXX_STANDARD 20
# main.cu uses C++20 features (std::span, abbreviated templates), so the CUDA dialect must match.
CUDA_STANDARD 20
OUTPUT_NAME ${PROJECT_NAME}
CXX_STANDARD_REQUIRED ON
CXX_EXTENSIONS OFF
)

target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES})
endif()
104 changes: 104 additions & 0 deletions examples/native-cuda/source/main.cu
@@ -0,0 +1,104 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
https://www.hzdr.de/crp
Copyright 2025 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include <mallocMC/mallocMC.cuh>

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <numeric>
#include <span>

/**
* @brief Computes the sum of squares of the first `n` natural numbers.
*
* This function calculates the sum of squares of the first `n` natural numbers using the formula:
* \[
* \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6}
* \]
* It's used to check the computed value in the kernel.
*
* @param n The number of natural numbers to consider.
* @return The sum of squares of the first `n` natural numbers.
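*
* For example, sumOfSquares(3) = (3 * 4 * 7) / 6 = 14 = 1 + 4 + 9.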
*/
__device__ auto sumOfSquares(auto const n)
{
return (n * (n + 1) * (2 * n + 1)) / 6;
}

/**
* @brief Computes the dot product of two vectors for each thread.
*
* This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
* Each thread allocates memory for its own vectors, initializes them with consecutive values,
* computes the dot product, and checks if the result matches the expected value.
* If the result does not match, the thread prints an error message and halts execution.
*
* @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
* @param numValues The number of elements in each vector.
*
* @note This kernel is, of course, not a very realistic workload, but it fulfills its purpose of showcasing a
* native CUDA application.
*/
__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
{
uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;

// Not very realistic, all threads are doing this on their own:
auto a = std::span<uint64_t>(
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
numValues);
auto b = std::span<uint64_t>(
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
numValues);

std::iota(std::begin(a), std::end(a), tid);
std::iota(std::begin(b), std::end(b), tid);

uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), uint64_t{0});

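// Both spans hold the consecutive values tid, tid+1, ..., tid+numValues-1, so the dot product is a sum of
// consecutive squares that telescopes into a difference of two closed-form sums:
//   sum_{i = tid}^{tid + numValues - 1} i^2 = sumOfSquares(tid + numValues - 1) - sumOfSquares(tid - 1)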
auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
if(result != expected)
{
printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected);
__trap();
}

memoryManager.free(a.data());
memoryManager.free(b.data());
}

int main()
{
size_t const heapSize = 1024U * 1024U * 1024U;
uint64_t const numValues = 32U;
mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};

std::cout << "Running native CUDA kernel." << std::endl;
oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);

// Wait for the kernel to finish before the host-side infrastructure goes out of scope.
cudaDeviceSynchronize();
}
2 changes: 1 addition & 1 deletion include/mallocMC/allocator.hpp
@@ -195,7 +195,7 @@ namespace mallocMC
 }
 
 ALPAKA_FN_HOST
-auto getAllocatorHandle() -> AllocatorHandle
+auto getAllocatorHandle() const -> AllocatorHandle
 {
 return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
 }
184 changes: 184 additions & 0 deletions include/mallocMC/mallocMC.cuh
@@ -0,0 +1,184 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
https://www.hzdr.de/crp
Copyright 2025 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include "mallocMC/alignmentPolicies/Shrink.hpp"
#include "mallocMC/creationPolicies/FlatterScatter.hpp"
#include "mallocMC/reservePoolPolicies/AlpakaBuf.hpp"

#include <mallocMC/mallocMC.hpp>
#include <sys/types.h>

#include <cstdint>

namespace mallocMC
{
// This namespace implements an alpaka-agnostic interface by choosing reasonable defaults that work well for
// CUDA devices. Further below, we export the necessary names to the global mallocMC:: namespace. See below if
// you're only interested in usage; look inside if you want to understand what we've done here or want to port
// this to other architectures.
namespace detail
{
using Dim = alpaka::DimInt<1>;
using Idx = std::uint32_t;
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;

// Hide the alpaka-specific Acc argument of `ReservePoolPolicies::AlpakaBuf`.
using CudaAlpakaBuf = ReservePoolPolicies::AlpakaBuf<Acc>;

/**
* @brief Allocator template with hidden alpaka-specifics.
*/
template<
typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
typename T_DistributionPolicy = DistributionPolicies::Noop,
typename T_OOMPolicy = OOMPolicies::ReturnNull,
typename T_ReservePoolPolicy = CudaAlpakaBuf,
typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
using CudaAllocator = Allocator<
Acc,
T_CreationPolicy,
T_DistributionPolicy,
T_OOMPolicy,
T_ReservePoolPolicy,
T_AlignmentPolicy>;

/**
* @brief Host-side infrastructure needed for setting up everything.
*
* You need to create an instance of this on the host. It provides the alpaka infrastructure and sets up
* everything on the device side, so you can get started allocating stuff.
*/
template<
typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
typename T_DistributionPolicy = DistributionPolicies::Noop,
typename T_OOMPolicy = OOMPolicies::ReturnNull,
typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf<Acc>,
typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
struct CudaHostInfrastructure
{
using MyAllocatorType = CudaAllocator<
T_CreationPolicy,
T_DistributionPolicy,
T_OOMPolicy,
T_ReservePoolPolicy,
T_AlignmentPolicy>;

// Keep this member first, so that aggregate initialization can be used as just
// CudaHostInfrastructure<>{heapSize};
size_t heapSize{};

// All of this is necessary alpaka infrastructure.
alpaka::Platform<Acc> const platform{};
std::remove_cv_t<decltype(alpaka::getDevByIdx(platform, 0))> const dev{alpaka::getDevByIdx(platform, 0)};
alpaka::Queue<Acc, alpaka::NonBlocking> queue{dev};

// This is our actual host-side instance of the allocator. It sets up everything on the device and provides
// the handle that we can pass to kernels.
MyAllocatorType hostInstance{dev, queue, heapSize};
};
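
// A minimal host-side sketch (mirroring examples/native-cuda/source/main.cu): construct the infrastructure
// with the desired heap size and derive a memory manager from it that can be handed to kernels:
//   mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
//   auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};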

/**
* @brief Memory manager to pass to kernels.
*
* Create this on the host and pass it to your kernels. It's a lightweight object, barely more than a pointer,
* so you can just copy it around as needed. Its main purpose is to provide an alpaka-agnostic interface by
* adding an accelerator internally before forwarding malloc/free calls to mallocMC.
*/
template<
typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
typename T_DistributionPolicy = DistributionPolicies::Noop,
typename T_OOMPolicy = OOMPolicies::ReturnNull,
typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf<Acc>,
typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
struct CudaMemoryManager
{
using MyHostInfrastructure = CudaHostInfrastructure<
T_CreationPolicy,
T_DistributionPolicy,
T_OOMPolicy,
T_ReservePoolPolicy,
T_AlignmentPolicy>;

/**
* @brief Construct the memory manager from the host infrastructure.
*
* @param hostInfrastructure Reference to the host infrastructure.
*/
explicit CudaMemoryManager(MyHostInfrastructure const& hostInfrastructure)
: deviceHandle(hostInfrastructure.hostInstance.getAllocatorHandle())
{
}

/**
* @brief Allocates memory on the device.
*
* @param size Size of the memory to allocate.
* @return Pointer to the allocated memory.
*/
__device__ __forceinline__ void* malloc(size_t size)
{
// This is cheating a tiny little bit. The accelerator could, in general, be a stateful object but
// concretely for CUDA and HIP it just forwards to the corresponding API calls, so it doesn't actually
// carry any information by itself. We're rather using it as a tag here.
std::array<std::byte, sizeof(Acc)> fakeAccMemory{};
return deviceHandle.malloc(*reinterpret_cast<Acc*>(fakeAccMemory.data()), size);
}

/**
* @brief Frees memory on the device.
*
* @param ptr Pointer to the memory to free.
*/
__device__ __forceinline__ void free(void* ptr)
{
std::array<std::byte, sizeof(Acc)> fakeAccMemory{};
deviceHandle.free(*reinterpret_cast<Acc*>(fakeAccMemory.data()), ptr);
}

/**
* @brief Handle to the device allocator.
*
* This is what actually does the work in mallocMC. We forward all our calls to this.
*/
MyHostInfrastructure::MyAllocatorType::AllocatorHandle deviceHandle;
};
} // namespace detail

// Use the following in your native CUDA code and you are good to go! All alpaka-specific interfaces are patched
// away.
using detail::CudaAllocator;
using detail::CudaHostInfrastructure;
using detail::CudaMemoryManager;
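
// A device-side sketch of this exported interface (as exercised in examples/native-cuda/source/main.cu),
// using a hypothetical kernel: pass the memory manager by value and call malloc/free on it from any thread.
//   __global__ void myKernel(mallocMC::CudaMemoryManager<> memoryManager)
//   {
//       auto* data = reinterpret_cast<uint64_t*>(memoryManager.malloc(42 * sizeof(uint64_t)));
//       /* ... use data ... */
//       memoryManager.free(data);
//   }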

namespace ReservePoolPolicies
{
// This is provided because the original ReservePoolPolicies::AlpakaBuf takes an alpaka::Acc as template
// argument.
using detail::CudaAlpakaBuf;
} // namespace ReservePoolPolicies
} // namespace mallocMC
