Add infrastructure and example for native CUDA
chillenzer committed Feb 7, 2025
1 parent a288377 commit d65aa6a
Showing 7 changed files with 334 additions and 5 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -42,6 +42,8 @@ target_include_directories(
 ${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
 $<INSTALL_INTERFACE:include/${PROJECT_NAME}-${PROJECT_VERSION}>
 )
+target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka)
+
 
 option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
 if(mallocMC_BUILD_TESTING)
7 changes: 4 additions & 3 deletions cmake/package-lock.cmake
@@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake
 # alpaka
 CPMDeclarePackage(alpaka
 NAME alpaka
-GIT_TAG 1.2.0
-GITHUB_REPOSITORY alpaka-group/alpaka
+# temporary solution until this is merged into alpaka
+GIT_TAG add-option-for-installation
+GITHUB_REPOSITORY chillenzer/alpaka
 OPTIONS
-"alpaka_CXX_STANDARD 20"
+"alpaka_CXX_STANDARD 20;alpaka_INSTALL ON"
 # It is recommended to let CPM cache dependencies in order to reduce redundant downloads.
 # However, we might in the foreseeable future turn to unstable references like the `dev` branch here.
 # Setting the following option tells CPM to not use the cache.
7 changes: 6 additions & 1 deletion examples/CMakeLists.txt
@@ -12,8 +12,13 @@ add_subdirectory(
 ${CMAKE_BINARY_DIR}/examples/getAvailableSlots
 )
 
+add_subdirectory(
+${CMAKE_CURRENT_LIST_DIR}/native-cuda
+${CMAKE_BINARY_DIR}/examples/native-cuda
+)
+
 add_custom_target(
 mallocMCExamples
-DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots
+DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda
 COMMENT "Shortcut for building all examples."
 )
33 changes: 33 additions & 0 deletions examples/native-cuda/CMakeLists.txt
@@ -0,0 +1,33 @@
cmake_minimum_required(VERSION 3.14...3.22)

include(CheckLanguage)
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA)

# --- Import tools ----

include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)

# ---- Dependencies ----

include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)

if(NOT TARGET mallocMC)
CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
endif()

# ---- Create standalone executable ----

add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu)

set_target_properties(${PROJECT_NAME}
PROPERTIES
CXX_STANDARD 20
# main.cu uses C++20 features (std::span, abbreviated templates), so the CUDA dialect must match.
CUDA_STANDARD 20
OUTPUT_NAME ${PROJECT_NAME}
CXX_STANDARD_REQUIRED ON
CXX_EXTENSIONS OFF
)

target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES})
endif()
104 changes: 104 additions & 0 deletions examples/native-cuda/source/main.cu
@@ -0,0 +1,104 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
https://www.hzdr.de/crp
Copyright 2025 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include <mallocMC/mallocMC.cuh>

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <numeric>
#include <span>

/**
* @brief Computes the sum of squares of the first `n` natural numbers.
*
* This function calculates the sum of squares of the first `n` natural numbers using the formula:
* \[
* \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6}
* \]
* It's used to check the computed value in the kernel.
*
* @param n The number of natural numbers to consider.
* @return The sum of squares of the first `n` natural numbers.
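*
* For example, sumOfSquares(3) = (3 * 4 * 7) / 6 = 14 = 1 + 4 + 9.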
*/
__device__ auto sumOfSquares(auto const n)
{
return (n * (n + 1) * (2 * n + 1)) / 6;
}

/**
* @brief Computes the dot product of two vectors for each thread.
*
* This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
* Each thread allocates memory for its own vectors, initializes them with consecutive values,
* computes the dot product, and checks if the result matches the expected value.
* If the result does not match, the thread prints an error message and halts execution.
*
* @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
* @param numValues The number of elements in each vector.
*
* @note This kernel is, of course, not a very realistic workload, but it fulfills its purpose of showcasing a
* native CUDA application.
*/
__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
{
uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;

// Not very realistic, all threads are doing this on their own:
auto a = std::span<uint64_t>(
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
numValues);
auto b = std::span<uint64_t>(
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
numValues);

std::iota(std::begin(a), std::end(a), tid);
std::iota(std::begin(b), std::end(b), tid);

uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), uint64_t{0});

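// Both spans hold the consecutive values tid, tid+1, ..., tid+numValues-1, so the dot product is a sum of
// consecutive squares that telescopes into a difference of two closed-form sums:
//   sum_{i = tid}^{tid + numValues - 1} i^2 = sumOfSquares(tid + numValues - 1) - sumOfSquares(tid - 1)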
auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
if(result != expected)
{
printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected);
__trap();
}

memoryManager.free(a.data());
memoryManager.free(b.data());
}

int main()
{
size_t const heapSize = 1024U * 1024U * 1024U;
uint64_t const numValues = 32U;
mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};

std::cout << "Running native CUDA kernel." << std::endl;
oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);

// Wait for the kernel to finish before the host-side infrastructure goes out of scope.
cudaDeviceSynchronize();
}
2 changes: 1 addition & 1 deletion include/mallocMC/allocator.hpp
@@ -195,7 +195,7 @@ namespace mallocMC
 }
 
 ALPAKA_FN_HOST
-auto getAllocatorHandle() -> AllocatorHandle
+auto getAllocatorHandle() const -> AllocatorHandle
 {
 return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
 }
184 changes: 184 additions & 0 deletions include/mallocMC/mallocMC.cuh
@@ -0,0 +1,184 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
https://www.hzdr.de/crp
Copyright 2025 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include "mallocMC/alignmentPolicies/Shrink.hpp"
#include "mallocMC/creationPolicies/FlatterScatter.hpp"
#include "mallocMC/reservePoolPolicies/AlpakaBuf.hpp"

#include <mallocMC/mallocMC.hpp>
#include <sys/types.h>

#include <cstdint>

namespace mallocMC
{
// This namespace implements an alpaka-agnostic interface by choosing reasonable defaults that work well for
// CUDA devices. Further below, we export the necessary names to the global mallocMC:: namespace. See below if
// you're only interested in usage; look inside if you want to understand what we've done here or want to port
// this to other architectures.
namespace detail
{
using Dim = alpaka::DimInt<1>;
using Idx = std::uint32_t;
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;

// Hide the alpaka-specific Acc argument of `ReservePoolPolicies::AlpakaBuf`.
using CudaAlpakaBuf = ReservePoolPolicies::AlpakaBuf<Acc>;

/**
* @brief Allocator template with hidden alpaka-specifics.
*/
template<
typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
typename T_DistributionPolicy = DistributionPolicies::Noop,
typename T_OOMPolicy = OOMPolicies::ReturnNull,
typename T_ReservePoolPolicy = CudaAlpakaBuf,
typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
using CudaAllocator = Allocator<
Acc,
T_CreationPolicy,
T_DistributionPolicy,
T_OOMPolicy,
T_ReservePoolPolicy,
T_AlignmentPolicy>;

/**
* @brief Host-side infrastructure needed for setting up everything.
*
* You need to create an instance of this on the host. It provides the alpaka infrastructure and sets up
* everything on the device side, so you can get started allocating stuff.
*/
template<
typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
typename T_DistributionPolicy = DistributionPolicies::Noop,
typename T_OOMPolicy = OOMPolicies::ReturnNull,
typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf<Acc>,
typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
struct CudaHostInfrastructure
{
using MyAllocatorType = CudaAllocator<
T_CreationPolicy,
T_DistributionPolicy,
T_OOMPolicy,
T_ReservePoolPolicy,
T_AlignmentPolicy>;

// Keep this member first, so that aggregate initialization can be used as just
// CudaHostInfrastructure<>{heapSize};
size_t heapSize{};

// All of this is necessary alpaka infrastructure.
alpaka::Platform<Acc> const platform{};
std::remove_cv_t<decltype(alpaka::getDevByIdx(platform, 0))> const dev{alpaka::getDevByIdx(platform, 0)};
alpaka::Queue<Acc, alpaka::NonBlocking> queue{dev};

// This is our actual host-side instance of the allocator. It sets up everything on the device and provides
// the handle that we can pass to kernels.
MyAllocatorType hostInstance{dev, queue, heapSize};
};
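
// A minimal host-side sketch (mirroring examples/native-cuda/source/main.cu): construct the infrastructure
// with the desired heap size and derive a memory manager from it that can be handed to kernels:
//   mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
//   auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};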

/**
* @brief Memory manager to pass to kernels.
*
* Create this on the host and pass it to your kernels. It's a lightweight object, barely more than a pointer,
* so you can just copy it around as needed. Its main purpose is to provide an alpaka-agnostic interface by
* adding an accelerator internally before forwarding malloc/free calls to mallocMC.
*/
template<
typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
typename T_DistributionPolicy = DistributionPolicies::Noop,
typename T_OOMPolicy = OOMPolicies::ReturnNull,
typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf<Acc>,
typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
struct CudaMemoryManager
{
using MyHostInfrastructure = CudaHostInfrastructure<
T_CreationPolicy,
T_DistributionPolicy,
T_OOMPolicy,
T_ReservePoolPolicy,
T_AlignmentPolicy>;

/**
* @brief Construct the memory manager from the host infrastructure.
*
* @param hostInfrastructure Reference to the host infrastructure.
*/
explicit CudaMemoryManager(MyHostInfrastructure const& hostInfrastructure)
: deviceHandle(hostInfrastructure.hostInstance.getAllocatorHandle())
{
}

/**
* @brief Allocates memory on the device.
*
* @param size Size of the memory to allocate.
* @return Pointer to the allocated memory.
*/
__device__ __forceinline__ void* malloc(size_t size)
{
// This is cheating a tiny little bit. The accelerator could, in general, be a stateful object but
// concretely for CUDA and HIP it just forwards to the corresponding API calls, so it doesn't actually
// carry any information by itself. We're rather using it as a tag here.
std::array<std::byte, sizeof(Acc)> fakeAccMemory{};
return deviceHandle.malloc(*reinterpret_cast<Acc*>(fakeAccMemory.data()), size);
}

/**
* @brief Frees memory on the device.
*
* @param ptr Pointer to the memory to free.
*/
__device__ __forceinline__ void free(void* ptr)
{
std::array<std::byte, sizeof(Acc)> fakeAccMemory{};
deviceHandle.free(*reinterpret_cast<Acc*>(fakeAccMemory.data()), ptr);
}

/**
* @brief Handle to the device allocator.
*
* This is what actually does the work in mallocMC. We forward all our calls to this.
*/
MyHostInfrastructure::MyAllocatorType::AllocatorHandle deviceHandle;
};
} // namespace detail

// Use the following in your native CUDA code and you are good to go! All alpaka-specific interfaces are patched
// away.
using detail::CudaAllocator;
using detail::CudaHostInfrastructure;
using detail::CudaMemoryManager;
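
// A device-side sketch of this exported interface (as exercised in examples/native-cuda/source/main.cu),
// using a hypothetical kernel: pass the memory manager by value and call malloc/free on it from any thread.
//   __global__ void myKernel(mallocMC::CudaMemoryManager<> memoryManager)
//   {
//       auto* data = reinterpret_cast<uint64_t*>(memoryManager.malloc(42 * sizeof(uint64_t)));
//       /* ... use data ... */
//       memoryManager.free(data);
//   }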

namespace ReservePoolPolicies
{
// This is provided because the original ReservePoolPolicies::AlpakaBuf takes an alpaka::Acc as template
// argument.
using detail::CudaAlpakaBuf;
} // namespace ReservePoolPolicies
} // namespace mallocMC
