From e68ee1af261f70ee5ee5fe11d656b246ffb0835a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 12 Oct 2021 23:19:19 -0700 Subject: [PATCH 1/7] Add Allocator abstraction Add an Allocator abstraction that looks similar to umpire. Use this abstraction in the cuda/hip backends. Refactor basic_mempool into AllocatorPool as the default cuda/hip Allocators. Add support for users to set new Allocators for cuda/hip. --- include/RAJA/RAJA.hpp | 2 +- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 171 +++++-- include/RAJA/policy/cuda/reduce.hpp | 28 +- include/RAJA/policy/cuda/scan.hpp | 16 +- include/RAJA/policy/cuda/sort.hpp | 40 +- include/RAJA/policy/hip/MemUtils_HIP.hpp | 171 +++++-- include/RAJA/policy/hip/reduce.hpp | 28 +- include/RAJA/policy/hip/scan.hpp | 16 +- include/RAJA/policy/hip/sort.hpp | 40 +- include/RAJA/policy/sycl/MemUtils_SYCL.hpp | 1 - include/RAJA/util/Allocator.hpp | 100 ++++ include/RAJA/util/AllocatorPool.hpp | 518 +++++++++++++++++++++ include/RAJA/util/SoAPtr.hpp | 40 +- include/RAJA/util/basic_mempool.hpp | 427 ----------------- 14 files changed, 1001 insertions(+), 597 deletions(-) create mode 100644 include/RAJA/util/Allocator.hpp create mode 100644 include/RAJA/util/AllocatorPool.hpp delete mode 100644 include/RAJA/util/basic_mempool.hpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 90cb5515f0..9945bd66e0 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -29,7 +29,7 @@ #include "RAJA/config.hpp" #include "RAJA/util/Operators.hpp" -#include "RAJA/util/basic_mempool.hpp" +#include "RAJA/util/Allocator.hpp" #include "RAJA/util/camp_aliases.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 89bbea87d4..b7275d044c 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -28,10 +28,12 @@ #include #include #include +#include #include "nvToolsExt.h" -#include "RAJA/util/basic_mempool.hpp" +#include "RAJA/util/Allocator.hpp" +#include "RAJA/util/AllocatorPool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/macros.hpp" @@ -46,70 +48,175 @@ namespace RAJA namespace cuda { +namespace detail +{ -//! Allocator for pinned memory for use in basic_mempool -struct PinnedAllocator { +//! Allocator for device memory for use in AllocatorPool +struct DeviceBaseAllocator { - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) + void* allocate(size_t nbytes) { void* ptr; - cudaErrchk(cudaHostAlloc(&ptr, nbytes, cudaHostAllocMapped)); + cudaErrchk(cudaMalloc(&ptr, nbytes)); return ptr; } - // returns true on success, false on failure - bool free(void* ptr) + void deallocate(void* ptr) { - cudaErrchk(cudaFreeHost(ptr)); - return true; + cudaErrchk(cudaFree(ptr)); } }; -//! Allocator for device memory for use in basic_mempool -struct DeviceAllocator { +//! Allocator for pre-zeroed device memory for use in AllocatorPool +// Note: Memory must be zero when returned to mempool +struct DeviceZeroedBaseAllocator { - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) + void* allocate(size_t nbytes) { void* ptr; cudaErrchk(cudaMalloc(&ptr, nbytes)); + cudaErrchk(cudaMemset(ptr, 0, nbytes)); return ptr; } - // returns true on success, false on failure - bool free(void* ptr) + void deallocate(void* ptr) { cudaErrchk(cudaFree(ptr)); - return true; } }; -//! Allocator for pre-zeroed device memory for use in basic_mempool -// Note: Memory must be zero when returned to mempool -struct DeviceZeroedAllocator { +//! Allocator for pinned memory for use in AllocatorPool +struct PinnedBaseAllocator { - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) + void* allocate(size_t nbytes) { void* ptr; - cudaErrchk(cudaMalloc(&ptr, nbytes)); - cudaErrchk(cudaMemset(ptr, 0, nbytes)); + cudaErrchk(cudaHostAlloc(&ptr, nbytes, cudaHostAllocMapped)); return ptr; } - // returns true on success, false on failure - bool free(void* ptr) + void deallocate(void* ptr) { - cudaErrchk(cudaFree(ptr)); - return true; + cudaErrchk(cudaFreeHost(ptr)); } }; -using device_mempool_type = basic_mempool::MemPool; -using device_zeroed_mempool_type = - basic_mempool::MemPool; -using pinned_mempool_type = basic_mempool::MemPool; +//! Make default allocators used by RAJA internally +inline std::unique_ptr make_default_device_allocator() +{ + return std::unique_ptr( + new AllocatorPool( + std::string("RAJA::cuda::default_device_allocator"), + DeviceBaseAllocator())); +} +/// +inline std::unique_ptr make_default_device_zeroed_allocator() +{ + return std::unique_ptr( + new AllocatorPool( + std::string("RAJA::cuda::default_device_zeroed_allocator"), + DeviceZeroedBaseAllocator())); +} +/// +inline std::unique_ptr make_default_pinned_allocator() +{ + return std::unique_ptr( + new AllocatorPool( + std::string("RAJA::cuda::default_pinned_allocator"), + PinnedBaseAllocator())); +} + +//! Storage for allocators used by RAJA internally +inline std::unique_ptr& get_device_allocator() +{ + static std::unique_ptr allocator( + make_default_device_allocator()); + return allocator; +} +/// +inline std::unique_ptr& get_device_zeroed_allocator() +{ + static std::unique_ptr allocator( + make_default_device_zeroed_allocator()); + return allocator; +} +/// +inline std::unique_ptr& get_pinned_allocator() +{ + static std::unique_ptr allocator( + make_default_pinned_allocator()); + return allocator; +} + +} // namespace detail + +//! Sets the allocator used by RAJA internally by making an allocator of +// allocator_type with the given arguments. It is an error to change the +// allocator when any memory is allocated. This routine is not thread safe. +template < typename allocator_type, typename ... Args > +inline void set_device_allocator(Args&&... args) +{ + detail::get_device_allocator().release(); + detail::get_device_allocator().reset( + new allocator_type(std::forward(args)...)); +} +/// +template < typename allocator_type, typename ... Args > +inline void set_device_zeroed_allocator(Args&&... args) +{ + detail::get_device_zeroed_allocator().release(); + detail::get_device_zeroed_allocator().reset( + new allocator_type(std::forward(args)...)); +} +/// +template < typename allocator_type, typename ... Args > +inline void set_pinned_allocator(Args&&... args) +{ + detail::get_pinned_allocator().release(); + detail::get_pinned_allocator().reset( + new allocator_type(std::forward(args)...)); +} + +//! Reset the allocator used by RAJA internally. This will destroy any existing +// allocator and replace it with the kind of allocator used by default. It is +// an error to change the allocator when any memory is allocated. This routine +// is not thread safe. +inline void reset_device_allocator() +{ + detail::get_device_allocator().release(); + detail::get_device_allocator() = + detail::make_default_device_allocator(); +} +inline void reset_device_zeroed_allocator() +{ + detail::get_device_zeroed_allocator().release(); + detail::get_device_zeroed_allocator() = + detail::make_default_device_zeroed_allocator(); +} +inline void reset_pinned_allocator() +{ + detail::get_pinned_allocator().release(); + detail::get_pinned_allocator() = + detail::make_default_pinned_allocator(); +} + +//! Gets the allocator used by RAJA internally. This allows the user to query +// the memory stats of the allocator. +inline RAJA::Allocator& get_device_allocator() +{ + return *detail::get_device_allocator(); +} +/// +inline RAJA::Allocator& get_device_zeroed_allocator() +{ + return *detail::get_device_zeroed_allocator(); +} +/// +inline RAJA::Allocator& get_pinned_allocator() +{ + return *detail::get_pinned_allocator(); +} + namespace detail { diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 9c0e358b6a..d9f6fd722e 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -32,7 +32,7 @@ #include "RAJA/util/macros.hpp" #include "RAJA/util/SoAArray.hpp" #include "RAJA/util/SoAPtr.hpp" -#include "RAJA/util/basic_mempool.hpp" +#include "RAJA/util/Allocator.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" @@ -695,7 +695,7 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); + Node* n = cuda::get_pinned_allocator().template allocate(1); n->next = rn->node_list; rn->node_list = n; return &n->value; @@ -718,7 +718,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - cuda::pinned_mempool_type::getInstance().free(n); + cuda::get_pinned_allocator().deallocate(n); } resource_list = rn->next; free(rn); @@ -751,7 +751,7 @@ struct Reduce_Data { mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; Reduce_Data() : Reduce_Data(T(), T()){}; @@ -813,9 +813,9 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device.allocate(cuda::get_device_allocator(), numBlocks); + device_count = cuda::get_device_zeroed_allocator() + .template allocate(1); own_device_ptr = true; } return act; @@ -827,8 +827,8 @@ struct Reduce_Data { { bool act = own_device_ptr; if (act) { - device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + device.deallocate(cuda::get_device_allocator()); + cuda::get_device_zeroed_allocator().deallocate(device_count); device_count = nullptr; own_device_ptr = false; } @@ -901,9 +901,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device = cuda::get_device_allocator().template allocate(1); + device_count = cuda::get_device_zeroed_allocator() + .template allocate(1); own_device_ptr = true; } return act; @@ -915,9 +915,9 @@ struct ReduceAtomic_Data { { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + cuda::get_device_allocator().deallocate(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + cuda::get_device_zeroed_allocator().deallocate(device_count); device_count = nullptr; own_device_ptr = false; } diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 978a1cb0a1..b49b5aeef8 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -67,7 +67,7 @@ inclusive_inplace( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, @@ -78,7 +78,7 @@ inclusive_inplace( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); cuda::launch(cuda_res, Async); @@ -120,7 +120,7 @@ exclusive_inplace( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, @@ -132,7 +132,7 @@ exclusive_inplace( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); cuda::launch(cuda_res, Async); @@ -173,7 +173,7 @@ inclusive( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, @@ -184,7 +184,7 @@ inclusive( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); cuda::launch(cuda_res, Async); @@ -228,7 +228,7 @@ exclusive( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, @@ -240,7 +240,7 @@ exclusive( len, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); cuda::launch(cuda_res, Async); diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index e91831bbc8..9d5546503f 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -95,7 +95,7 @@ stable( int end_bit=sizeof(R)*CHAR_BIT; // Allocate temporary storage for the output array - R* d_out = cuda::device_mempool_type::getInstance().malloc(len); + R* d_out = cuda::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the begin buffer @@ -113,7 +113,7 @@ stable( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -125,7 +125,7 @@ stable( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); if (d_keys.Current() == d_out) { @@ -133,7 +133,7 @@ stable( cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_out); + cuda::get_device_allocator().deallocate(d_out); cuda::launch(cuda_res, Async); @@ -163,7 +163,7 @@ stable( int end_bit=sizeof(R)*CHAR_BIT; // Allocate temporary storage for the output array - R* d_out = cuda::device_mempool_type::getInstance().malloc(len); + R* d_out = cuda::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the begin buffer @@ -181,7 +181,7 @@ stable( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -193,7 +193,7 @@ stable( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); if (d_keys.Current() == d_out) { @@ -201,7 +201,7 @@ stable( cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_out); + cuda::get_device_allocator().deallocate(d_out); cuda::launch(cuda_res, Async); @@ -338,8 +338,8 @@ stable_pairs( int end_bit=sizeof(K)*CHAR_BIT; // Allocate temporary storage for the output arrays - K* d_keys_out = cuda::device_mempool_type::getInstance().malloc(len); - V* d_vals_out = cuda::device_mempool_type::getInstance().malloc(len); + K* d_keys_out = cuda::get_device_allocator().template allocate(len); + V* d_vals_out = cuda::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the keys_begin and vals_begin buffers @@ -359,7 +359,7 @@ stable_pairs( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -372,7 +372,7 @@ stable_pairs( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); if (d_keys.Current() == d_keys_out) { @@ -385,8 +385,8 @@ stable_pairs( cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_keys_out); - cuda::device_mempool_type::getInstance().free(d_vals_out); + cuda::get_device_allocator().deallocate(d_keys_out); + cuda::get_device_allocator().deallocate(d_vals_out); cuda::launch(cuda_res, Async); @@ -420,8 +420,8 @@ stable_pairs( int end_bit=sizeof(K)*CHAR_BIT; // Allocate temporary storage for the output arrays - K* d_keys_out = cuda::device_mempool_type::getInstance().malloc(len); - V* d_vals_out = cuda::device_mempool_type::getInstance().malloc(len); + K* d_keys_out = cuda::get_device_allocator().template allocate(len); + V* d_vals_out = cuda::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the keys_begin and vals_begin buffers @@ -441,7 +441,7 @@ stable_pairs( stream)); // Allocate temporary storage d_temp_storage = - cuda::device_mempool_type::getInstance().malloc( + cuda::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -454,7 +454,7 @@ stable_pairs( end_bit, stream)); // Free temporary storage - cuda::device_mempool_type::getInstance().free(d_temp_storage); + cuda::get_device_allocator().deallocate(d_temp_storage); if (d_keys.Current() == d_keys_out) { @@ -467,8 +467,8 @@ stable_pairs( cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream)); } - cuda::device_mempool_type::getInstance().free(d_keys_out); - cuda::device_mempool_type::getInstance().free(d_vals_out); + cuda::get_device_allocator().deallocate(d_keys_out); + cuda::get_device_allocator().deallocate(d_vals_out); cuda::launch(cuda_res, Async); diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 58f36946e1..5f32b83b94 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -28,8 +28,10 @@ #include #include #include +#include -#include "RAJA/util/basic_mempool.hpp" +#include "RAJA/util/Allocator.hpp" +#include "RAJA/util/AllocatorPool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" #include "RAJA/util/macros.hpp" @@ -49,70 +51,175 @@ namespace RAJA namespace hip { +namespace detail +{ -//! Allocator for pinned memory for use in basic_mempool -struct PinnedAllocator { +//! Allocator for device memory for use in basic_mempool +struct DeviceBaseAllocator { - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) + void* allocate(size_t nbytes) { void* ptr; - hipErrchk(hipHostMalloc(&ptr, nbytes, hipHostMallocMapped)); + hipErrchk(hipMalloc(&ptr, nbytes)); return ptr; } - // returns true on success, false on failure - bool free(void* ptr) + void deallocate(void* ptr) { - hipErrchk(hipHostFree(ptr)); - return true; + hipErrchk(hipFree(ptr)); } }; -//! Allocator for device memory for use in basic_mempool -struct DeviceAllocator { +//! Allocator for pre-zeroed device memory for use in basic_mempool +// Note: Memory must be zero when returned to mempool +struct DeviceZeroedBaseAllocator { - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) + void* allocate(size_t nbytes) { void* ptr; hipErrchk(hipMalloc(&ptr, nbytes)); + hipErrchk(hipMemset(ptr, 0, nbytes)); return ptr; } - // returns true on success, false on failure - bool free(void* ptr) + void deallocate(void* ptr) { hipErrchk(hipFree(ptr)); - return true; } }; -//! Allocator for pre-zeroed device memory for use in basic_mempool -// Note: Memory must be zero when returned to mempool -struct DeviceZeroedAllocator { +//! Allocator for pinned memory for use in basic_mempool +struct PinnedBaseAllocator { - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) + void* allocate(size_t nbytes) { void* ptr; - hipErrchk(hipMalloc(&ptr, nbytes)); - hipErrchk(hipMemset(ptr, 0, nbytes)); + hipErrchk(hipHostMalloc(&ptr, nbytes, hipHostMallocMapped)); return ptr; } - // returns true on success, false on failure - bool free(void* ptr) + void deallocate(void* ptr) { - hipErrchk(hipFree(ptr)); - return true; + hipErrchk(hipHostFree(ptr)); } }; -using device_mempool_type = basic_mempool::MemPool; -using device_zeroed_mempool_type = - basic_mempool::MemPool; -using pinned_mempool_type = basic_mempool::MemPool; +//! Make default allocators used by RAJA internally +inline std::unique_ptr make_default_device_allocator() +{ + return std::unique_ptr( + new AllocatorPool( + std::string("RAJA::hip::default_device_allocator"), + DeviceBaseAllocator())); +} +/// +inline std::unique_ptr make_default_device_zeroed_allocator() +{ + return std::unique_ptr( + new AllocatorPool( + std::string("RAJA::hip::default_device_zeroed_allocator"), + DeviceZeroedBaseAllocator())); +} +/// +inline std::unique_ptr make_default_pinned_allocator() +{ + return std::unique_ptr( + new AllocatorPool( + std::string("RAJA::hip::default_pinned_allocator"), + PinnedBaseAllocator())); +} + +//! Storage for allocators used by RAJA internally +inline std::unique_ptr& get_device_allocator() +{ + static std::unique_ptr allocator( + make_default_device_allocator()); + return allocator; +} +/// +inline std::unique_ptr& get_device_zeroed_allocator() +{ + static std::unique_ptr allocator( + make_default_device_zeroed_allocator()); + return allocator; +} +/// +inline std::unique_ptr& get_pinned_allocator() +{ + static std::unique_ptr allocator( + make_default_pinned_allocator()); + return allocator; +} + +} // namespace detail + +//! Sets the allocator used by RAJA internally by making an allocator of +// allocator_type with the given arguments. It is an error to change the +// allocator when any memory is allocated. This routine is not thread safe. +template < typename allocator_type, typename ... Args > +inline void set_device_allocator(Args&&... args) +{ + detail::get_device_allocator().release(); + detail::get_device_allocator().reset( + new allocator_type(std::forward(args)...)); +} +/// +template < typename allocator_type, typename ... Args > +inline void set_device_zeroed_allocator(Args&&... args) +{ + detail::get_device_zeroed_allocator().release(); + detail::get_device_zeroed_allocator().reset( + new allocator_type(std::forward(args)...)); +} +/// +template < typename allocator_type, typename ... Args > +inline void set_pinned_allocator(Args&&... args) +{ + detail::get_pinned_allocator().release(); + detail::get_pinned_allocator().reset( + new allocator_type(std::forward(args)...)); +} + +//! Reset the allocator used by RAJA internally. This will destroy any existing +// allocator and replace it with the kind of allocator used by default. It is +// an error to change the allocator when any memory is allocated. This routine +// is not thread safe. +inline void reset_device_allocator() +{ + detail::get_device_allocator().release(); + detail::get_device_allocator() = + detail::make_default_device_allocator(); +} +inline void reset_device_zeroed_allocator() +{ + detail::get_device_zeroed_allocator().release(); + detail::get_device_zeroed_allocator() = + detail::make_default_device_zeroed_allocator(); +} +inline void reset_pinned_allocator() +{ + detail::get_pinned_allocator().release(); + detail::get_pinned_allocator() = + detail::make_default_pinned_allocator(); +} + +//! Gets the allocator used by RAJA internally. This allows the user to query +// the memory stats of the allocator. +inline RAJA::Allocator& get_device_allocator() +{ + return *detail::get_device_allocator(); +} +/// +inline RAJA::Allocator& get_device_zeroed_allocator() +{ + return *detail::get_device_zeroed_allocator(); +} +/// +inline RAJA::Allocator& get_pinned_allocator() +{ + return *detail::get_pinned_allocator(); +} + namespace detail { diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index d37b0fd9ff..4ab816a5d8 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -32,7 +32,7 @@ #include "RAJA/util/macros.hpp" #include "RAJA/util/SoAArray.hpp" #include "RAJA/util/SoAPtr.hpp" -#include "RAJA/util/basic_mempool.hpp" +#include "RAJA/util/Allocator.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" @@ -567,7 +567,7 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); + Node* n = hip::get_pinned_allocator().template allocate(1); n->next = rn->node_list; rn->node_list = n; return &n->value; @@ -590,7 +590,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - hip::pinned_mempool_type::getInstance().free(n); + hip::get_pinned_allocator().deallocate(n); } resource_list = rn->next; free(rn); @@ -623,7 +623,7 @@ struct Reduce_Data { mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; Reduce_Data() : Reduce_Data(T(), T()){}; @@ -683,9 +683,9 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device.allocate(hip::get_device_allocator(), numBlocks); + device_count = hip::get_device_zeroed_allocator() + .template allocate(1); own_device_ptr = true; } return act; @@ -697,8 +697,8 @@ struct Reduce_Data { { bool act = own_device_ptr; if (act) { - device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + device.deallocate(hip::get_device_allocator()); + hip::get_device_zeroed_allocator().deallocate(device_count); device_count = nullptr; own_device_ptr = false; } @@ -769,9 +769,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device = hip::get_device_allocator().template allocate(1); + device_count = hip::get_device_zeroed_allocator() + .template allocate(1); own_device_ptr = true; } return act; @@ -783,9 +783,9 @@ struct ReduceAtomic_Data { { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + hip::get_device_allocator().deallocate(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + hip::get_device_zeroed_allocator().deallocate(device_count); device_count = nullptr; own_device_ptr = false; } diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 85bc494abb..18c2732175 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -83,7 +83,7 @@ inclusive_inplace( // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run #if defined(__HIPCC__) @@ -104,7 +104,7 @@ inclusive_inplace( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); hip::launch(hip_res, Async); @@ -157,7 +157,7 @@ exclusive_inplace( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run #if defined(__HIPCC__) @@ -180,7 +180,7 @@ exclusive_inplace( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); hip::launch(hip_res, Async); @@ -231,7 +231,7 @@ inclusive( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run #if defined(__HIPCC__) @@ -252,7 +252,7 @@ inclusive( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); hip::launch(hip_res, Async); @@ -307,7 +307,7 @@ exclusive( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run #if defined(__HIPCC__) @@ -330,7 +330,7 @@ exclusive( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); hip::launch(hip_res, Async); diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index 9090721ff5..49a2e95457 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -122,7 +122,7 @@ stable( int end_bit=sizeof(R)*CHAR_BIT; // Allocate temporary storage for the output array - R* d_out = hip::device_mempool_type::getInstance().malloc(len); + R* d_out = hip::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the begin buffer @@ -150,7 +150,7 @@ stable( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -172,7 +172,7 @@ stable( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); if (detail::get_current(d_keys) == d_out) { @@ -180,7 +180,7 @@ stable( hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream)); } - hip::device_mempool_type::getInstance().free(d_out); + hip::get_device_allocator().deallocate(d_out); hip::launch(hip_res, Async); @@ -210,7 +210,7 @@ stable( int end_bit=sizeof(R)*CHAR_BIT; // Allocate temporary storage for the output array - R* d_out = hip::device_mempool_type::getInstance().malloc(len); + R* d_out = hip::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the begin buffer @@ -238,7 +238,7 @@ stable( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -260,7 +260,7 @@ stable( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); if (detail::get_current(d_keys) == d_out) { @@ -268,7 +268,7 @@ stable( hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream)); } - hip::device_mempool_type::getInstance().free(d_out); + hip::get_device_allocator().deallocate(d_out); hip::launch(hip_res, Async); @@ -403,8 +403,8 @@ stable_pairs( int end_bit=sizeof(K)*CHAR_BIT; // Allocate temporary storage for the output arrays - K* d_keys_out = hip::device_mempool_type::getInstance().malloc(len); - V* d_vals_out = hip::device_mempool_type::getInstance().malloc(len); + K* d_keys_out = hip::get_device_allocator().template allocate(len); + V* d_vals_out = hip::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the keys_begin and vals_begin buffers @@ -435,7 +435,7 @@ stable_pairs( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -459,7 +459,7 @@ stable_pairs( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); if (detail::get_current(d_keys) == d_keys_out) { @@ -472,8 +472,8 @@ stable_pairs( hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream)); } - hip::device_mempool_type::getInstance().free(d_keys_out); - hip::device_mempool_type::getInstance().free(d_vals_out); + hip::get_device_allocator().deallocate(d_keys_out); + hip::get_device_allocator().deallocate(d_vals_out); hip::launch(hip_res, Async); @@ -507,8 +507,8 @@ stable_pairs( int end_bit=sizeof(K)*CHAR_BIT; // Allocate temporary storage for the output arrays - K* d_keys_out = hip::device_mempool_type::getInstance().malloc(len); - V* d_vals_out = hip::device_mempool_type::getInstance().malloc(len); + K* d_keys_out = hip::get_device_allocator().template allocate(len); + V* d_vals_out = hip::get_device_allocator().template allocate(len); // use cub double buffer to reduce temporary memory requirements // by allowing cub to write to the keys_begin and vals_begin buffers @@ -539,7 +539,7 @@ stable_pairs( #endif // Allocate temporary storage d_temp_storage = - hip::device_mempool_type::getInstance().malloc( + hip::get_device_allocator().template allocate( temp_storage_bytes); // Run @@ -563,7 +563,7 @@ stable_pairs( stream)); #endif // Free temporary storage - hip::device_mempool_type::getInstance().free(d_temp_storage); + hip::get_device_allocator().deallocate(d_temp_storage); if (detail::get_current(d_keys) == d_keys_out) { @@ -576,8 +576,8 @@ stable_pairs( hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream)); } - hip::device_mempool_type::getInstance().free(d_keys_out); - hip::device_mempool_type::getInstance().free(d_vals_out); + hip::get_device_allocator().deallocate(d_keys_out); + hip::get_device_allocator().deallocate(d_vals_out); hip::launch(hip_res, Async); diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp index bca7c20550..ab4bb10c8c 100644 --- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp +++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp @@ -31,7 +31,6 @@ #include #include -#include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp new file mode 100644 index 0000000000..f0e226bfb7 --- /dev/null +++ b/include/RAJA/util/Allocator.hpp @@ -0,0 +1,100 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file containing an implementation of a memory pool. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_UTIL_ALLOCATOR_HPP +#define RAJA_UTIL_ALLOCATOR_HPP + +#include +#include +#include + +#include "RAJA/util/camp_aliases.hpp" + +namespace RAJA +{ + +/*! \class Allocator + ****************************************************************************** + * + * \brief Allocator Provides a generic interface for allocation in RAJA + * + * Allocator& device_pool = RAJA::cuda::get_device_allocator(); + * + ****************************************************************************** + */ +struct Allocator +{ + Allocator() = default; + + // not copyable or movable + Allocator(Allocator const&) = delete; + Allocator(Allocator &&) = delete; + Allocator& operator=(Allocator const&) = delete; + Allocator& operator=(Allocator &&) = delete; + + virtual ~Allocator() = default; + + virtual void* allocate(size_t nbytes, + size_t alignment = alignof(std::max_align_t)) = 0; + + template + inline T* allocate(size_t nitems, + size_t alignment = alignof(T)) + { + return static_cast(this->allocate(sizeof(T)*nitems, alignment)); + } + + virtual void deallocate(void* ptr) = 0; + + virtual void release() = 0; + + virtual size_t getHighWatermark() const noexcept = 0; + + virtual size_t getCurrentSize() const noexcept = 0; + + virtual size_t getActualSize() const noexcept = 0; + + virtual size_t getAllocationCount() const noexcept = 0; + + virtual const std::string& getName() const noexcept = 0; + + // virtual Platform getPlatform() const noexcept = 0; +}; + +namespace detail +{ + +inline std::vector& get_allocators() +{ + static std::vector allocators; + return allocators; +} + +} /* end namespace detail */ + +/*! + * \brief Get the set of allocators used by RAJA internally + */ +inline std::vector get_allocators() +{ + return detail::get_allocators(); +} + +} /* end namespace RAJA */ + + +#endif /* RAJA_UTIL_ALLOCATOR_HPP */ diff --git a/include/RAJA/util/AllocatorPool.hpp b/include/RAJA/util/AllocatorPool.hpp new file mode 100644 index 0000000000..1c8405723a --- /dev/null +++ b/include/RAJA/util/AllocatorPool.hpp @@ -0,0 +1,518 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief RAJA header file containing an implementation of a memory pool. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_UTIL_ALLOCATORPOOL_HPP +#define RAJA_UTIL_ALLOCATORPOOL_HPP + +#include "RAJA/config.hpp" + +#include +#include +#include +#include +#include +#include + +#include "RAJA/util/align.hpp" +#include "RAJA/util/mutex.hpp" +#include "RAJA/util/Allocator.hpp" + +// Note that this header is not included in RAJA.hpp +// to avoid this warning when using openmp enabled headers with +// non-openmp compilers +#if defined(RAJA_ENABLE_OPENMP) && !defined(_OPENMP) +#error RAJA configured with ENABLE_OPENMP, but OpenMP not supported by current compiler +#endif + +namespace RAJA +{ + +namespace detail +{ + +//! example allocator for AllocatorPool using allocate/deallocate +struct HostBaseAllocator { + + // returns a valid pointer on success, nullptr on failure + void* allocate(std::size_t nbytes) + { + return std::malloc(nbytes); + } + + // returns true on success, false on failure + void deallocate(void* ptr) + { + std::free(ptr); + } +}; + +//! Class abstracting a range in memory +struct MemoryChunk { + void* begin = nullptr; + void* end = nullptr; + + MemoryChunk() = default; + + MemoryChunk(void* ptr, size_t nbytes) + : begin(ptr) + , end(static_cast(begin) + nbytes) + { } + + MemoryChunk(void* begin_, void* end_) + : begin(begin_) + , end(end_) + { } + + size_t nbytes() const + { + return static_cast(end) - + static_cast(begin); + } + + explicit operator bool() const + { + return begin != nullptr; + } +}; + +/*! \class MemoryArena + ****************************************************************************** + * + * \brief MemoryArena is a map based subclass for class Allocator + * provides book-keeping to divy a large chunk of pre-allocated memory to avoid + * the overhead of malloc/free or cudaMalloc/cudaFree, etc + * + * get/give are the primary calls used by class Allocator to get aligned memory + * from the pool or give it back + * + * + ****************************************************************************** + */ +struct MemoryArena +{ + using free_type = std::map; + using free_value_type = typename free_type::value_type; + using used_type = std::map; + using used_value_type = typename used_type::value_type; + + + MemoryArena(MemoryChunk mem) + : m_allocation(mem) + { + if (m_allocation.begin == nullptr) { + RAJA_ABORT_OR_THROW("RAJA::detail::MemoryArena Attempt to create with no memory"); + } + m_free_space[m_allocation.begin] = m_allocation.end ; + } + + MemoryArena(MemoryArena const&) = delete; + MemoryArena& operator=(MemoryArena const&) = delete; + + MemoryArena(MemoryArena&&) = default; + MemoryArena& operator=(MemoryArena&&) = default; + + size_t capacity() const + { + return m_allocation.nbytes(); + } + + bool unused() const + { + return m_used_space.empty(); + } + + MemoryChunk get_allocation() + { + return m_allocation; + } + + MemoryChunk get(size_t nbytes, size_t alignment) + { + MemoryChunk mem; + if (capacity() >= nbytes) { + free_type::iterator end = m_free_space.end(); + for (free_type::iterator iter = m_free_space.begin(); iter != end; + ++iter) { + + void* adj_ptr = iter->first; + size_t cap = + static_cast(iter->second) - static_cast(adj_ptr); + + if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) { + + mem = MemoryChunk(adj_ptr, nbytes); + + remove_free_chunk(iter, + mem); + + add_used_chunk(mem); + + break; + } + } + } + return mem; + } + + MemoryChunk give(void* ptr) + { + MemoryChunk mem; + if (m_allocation.begin <= ptr && ptr < m_allocation.end) { + + used_type::iterator found = m_used_space.find(ptr); + + if (found != m_used_space.end()) { + + mem = MemoryChunk(found->first, found->second); + + add_free_chunk(mem); + + m_used_space.erase(found); + + } else { + RAJA_ABORT_OR_THROW("RAJA::detail::MemoryArena::give invalid ptr"); + } + } + return mem; + } + +private: + void add_free_chunk(MemoryChunk mem) + { + // integrates a chunk of memory into free_space + free_type::iterator invl = m_free_space.end(); + free_type::iterator next = m_free_space.lower_bound(mem.begin); + + // check if prev exists + if (next != m_free_space.begin()) { + // check if prev can cover [begin, end) + free_type::iterator prev = next; + --prev; + if (prev->second == mem.begin) { + // extend prev to cover [begin, end) + prev->second = mem.end; + + // check if prev can cover next too + if (next != invl) { + assert(next->first != mem.begin); + + if (next->first == mem.end) { + // extend prev to cover next too + prev->second = next->second; + + // remove redundant next + m_free_space.erase(next); + } + } + return; + } + } + + if (next != invl) { + assert(next->first != mem.begin); + + if (next->first == mem.end) { + // extend next to cover [begin, end) + m_free_space.insert(next, free_value_type{mem.begin, next->second}); + m_free_space.erase(next); + + return; + } + } + + // no free space adjacent to this chunk, add separate free chunk [begin, + // end) + m_free_space.insert(next, free_value_type{mem.begin, mem.end}); + } + + void remove_free_chunk(free_type::iterator iter, MemoryChunk mem) + { + void* ptr = iter->first; + void* ptr_end = iter->second; + + // fixup m_free_space, shrinking and adding chunks as needed + if (ptr != mem.begin) { + + // shrink end of current free region to [ptr, begin) + iter->second = mem.begin; + + if (mem.end != ptr_end) { + + // insert free region [end, ptr_end) after current free region + free_type::iterator next = iter; + ++next; + m_free_space.insert(next, free_value_type{mem.end, ptr_end}); + } + + } else if (mem.end != ptr_end) { + + // shrink beginning of current free region to [end, ptr_end) + free_type::iterator next = iter; + ++next; + m_free_space.insert(next, free_value_type{mem.end, ptr_end}); + m_free_space.erase(iter); + + } else { + + // can not reuse current region, erase + m_free_space.erase(iter); + } + } + + void add_used_chunk(MemoryChunk mem) + { + // simply inserts a chunk of memory into used_space + m_used_space.insert(used_value_type{mem.begin, mem.end}); + } + + MemoryChunk m_allocation; + free_type m_free_space; + used_type m_used_space; +}; + +} /* end namespace detail */ + + +/*! \class AllocatorPool + ****************************************************************************** + * + * \brief AllocatorPool provides a a RAJA::Allocator that is a basic memory + * pool on top of the given allocator_type. + * + * This is used for RAJA's internal allocations by default, but a different + * Allocator can be set for specific backend allocators. For example + * RAJA::cuda::set_device_allocator allows the user to change the device + * allocator used by RAJA internally. + * + * AllocatorPool uses MemoryArena to do the heavy lifting of maintaining + * access to the used/free space. + * + * The following are some examples + * + * //! example allocator for AllocatorPool using allocate/deallocate + * struct host_allocator { + * + * // returns a valid pointer on success, nullptr on failure + * void* allocate(std::size_t nbytes) + * { + * return std::malloc(nbytes); + * } + * + * // returns true on success, false on failure + * void deallocate(void* ptr) + * { + * std::free(ptr); + * } + * }; + * + * RAJA::Allocator* aloc = + * new RAJA::AllocatorPool(); + * + ****************************************************************************** + */ +template +struct AllocatorPool : Allocator +{ + using allocator_type = allocator_t; + + static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull; + + AllocatorPool(std::string const& name, + allocator_type const& aloc = allocator_type{}, + size_t default_arena_size = default_default_arena_size) + : m_default_arena_size(default_arena_size) + , m_alloc(aloc) + , m_name(name) // std::string("RAJA::AllocatorPool<")+m_alloc.getName()+">") + { + } + + // not copyable or movable + AllocatorPool(AllocatorPool const&) = delete; + AllocatorPool(AllocatorPool &&) = delete; + AllocatorPool& operator=(AllocatorPool const&) = delete; + AllocatorPool& operator=(AllocatorPool &&) = delete; + + virtual ~AllocatorPool() + { + // When used with static storage duration, it is possible to encounter + // errors like cudaErrorCudartUnloading with cudaFree. So do not call + // release here to avoid potential cuda calls and errors. + } + + void* allocate(size_t nbytes, + size_t alignment = alignof(std::max_align_t)) final + { + if (nbytes == 0) return nullptr; + +#if defined(RAJA_ENABLE_OPENMP) + lock_guard lock(m_mutex); +#endif + + detail::MemoryChunk mem; + + // find a usable memory chunk in an existing arena + arena_container_type::iterator end = m_arenas.end(); + for (arena_container_type::iterator iter = m_arenas.begin(); iter != end; + ++iter) { + mem = iter->get(nbytes, alignment); + if (mem.begin != nullptr) { + break; + } + } + + // allocate a new memory chunk + if (mem.begin == nullptr) { + const size_t alloc_size = + std::max(nbytes + alignment, m_default_arena_size); + detail::MemoryChunk arena_mem(m_alloc.allocate(alloc_size), alloc_size); + if (arena_mem.begin != nullptr) { + m_arenas.emplace_front(arena_mem); + m_actualSize += m_arenas.front().capacity(); + mem = m_arenas.front().get(nbytes, alignment); + } else{ + RAJA_ABORT_OR_THROW("RAJA::AllocatorPool::allocate arena allocation failed"); + } + } + + // update stats + m_currentSize += mem.nbytes(); + if (m_currentSize > m_highWatermark) { + m_highWatermark = m_currentSize; + } + m_allocationCount += 1; + + return mem.begin; + } + + void deallocate(void* ptr) final + { +#if defined(RAJA_ENABLE_OPENMP) + lock_guard lock(m_mutex); +#endif + + // find arena that owns ptr and return it + detail::MemoryChunk mem; + arena_container_type::iterator end = m_arenas.end(); + for (arena_container_type::iterator iter = m_arenas.begin(); iter != end; + ++iter) { + if ( (mem = iter->give(ptr)) ) { + ptr = nullptr; + // update stats + m_currentSize -= mem.nbytes(); + m_allocationCount -= 1; + break; + } + } + + if (ptr != nullptr) { + RAJA_ABORT_OR_THROW("RAJA::AllocatorPool::deallocate unknown pointer"); + } + } + + void release() final + { +#if defined(RAJA_ENABLE_OPENMP) + lock_guard lock(m_mutex); +#endif + + for (auto it = m_arenas.begin(); it != m_arenas.end(); /* do nothing */) { + if (it->unused()) { + // update stats + m_actualSize -= it->capacity(); + // deallocate memory + detail::MemoryChunk mem = it->get_allocation(); + m_alloc.deallocate(mem.begin); + // erase + it = m_arenas.erase(it); + } else { + ++it; + } + } + } + + size_t get_arena_size() const + { +#if defined(RAJA_ENABLE_OPENMP) + lock_guard lock(m_mutex); +#endif + + return m_default_arena_size; + } + + size_t set_arena_size(size_t new_arena_size) + { +#if defined(RAJA_ENABLE_OPENMP) + lock_guard lock(m_mutex); +#endif + + size_t prev_size = m_default_arena_size; + m_default_arena_size = new_arena_size; + return prev_size; + } + + size_t getHighWatermark() const noexcept final + { + return m_highWatermark; + } + + size_t getCurrentSize() const noexcept final + { + return m_currentSize; + } + + size_t getActualSize() const noexcept final + { + return m_actualSize; + } + + size_t getAllocationCount() const noexcept final + { + return m_allocationCount; + } + + const std::string& getName() const noexcept final + { + return m_name; + } + + // Platform getPlatform() const noexcept final + // { + // return m_alloc.getPlatform(); + // } + +private: + using arena_container_type = std::list; + + arena_container_type m_arenas; + size_t m_default_arena_size; + allocator_t m_alloc; + std::string m_name; + + size_t m_highWatermark = 0; + size_t m_currentSize = 0; + size_t m_actualSize = 0; + size_t m_allocationCount = 0; + +#if defined(RAJA_ENABLE_OPENMP) + omp::mutex m_mutex; +#endif +}; + +} /* end namespace RAJA */ + + +#endif /* RAJA_UTIL_ALLOCATORPOOL_HPP */ diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index 2c459c1683..b10587d84c 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -20,6 +20,8 @@ #include "RAJA/config.hpp" +#include "RAJA/util/Allocator.hpp" + // for RAJA::reduce::detail::ValueLoc #include "RAJA/pattern/detail/reduce.hpp" @@ -36,29 +38,27 @@ namespace detail * This is useful for creating a vectorizable data layout and getting * coalesced memory accesses or avoiding shared memory bank conflicts in cuda. */ -template > +template class SoAPtr { using value_type = T; public: SoAPtr() = default; - explicit SoAPtr(size_t size) - : mem(mempool::getInstance().template malloc(size)) + SoAPtr(Allocator& allocator, size_t size) + : mem(allocator.template allocate(size)) { } - SoAPtr& allocate(size_t size) + SoAPtr& allocate(Allocator& allocator, size_t size) { - mem = mempool::getInstance().template malloc(size); + mem = allocator.template allocate(size); return *this; } - SoAPtr& deallocate() + SoAPtr& deallocate(Allocator& allocator) { - mempool::getInstance().free(mem); + allocator.deallocate(mem); mem = nullptr; return *this; } @@ -75,8 +75,8 @@ class SoAPtr /*! * @brief Specialization for RAJA::reduce::detail::ValueLoc. */ -template -class SoAPtr, mempool> +template +class SoAPtr> { using value_type = RAJA::reduce::detail::ValueLoc; using first_type = T; @@ -84,24 +84,24 @@ class SoAPtr, mempool> public: SoAPtr() = default; - explicit SoAPtr(size_t size) - : mem(mempool::getInstance().template malloc(size)), - mem_idx(mempool::getInstance().template malloc(size)) + SoAPtr(Allocator& allocator, size_t size) + : mem(allocator.template allocate(size)), + mem_idx(allocator.template allocate(size)) { } - SoAPtr& allocate(size_t size) + SoAPtr& allocate(Allocator& allocator, size_t size) { - mem = mempool::getInstance().template malloc(size); - mem_idx = mempool::getInstance().template malloc(size); + mem = allocator.template allocate(size); + mem_idx = allocator.template allocate(size); return *this; } - SoAPtr& deallocate() + SoAPtr& deallocate(Allocator& allocator) { - mempool::getInstance().free(mem); + allocator.deallocate(mem); mem = nullptr; - mempool::getInstance().free(mem_idx); + allocator.deallocate(mem_idx); mem_idx = nullptr; return *this; } diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp deleted file mode 100644 index 517fb5a086..0000000000 --- a/include/RAJA/util/basic_mempool.hpp +++ /dev/null @@ -1,427 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief RAJA header file containing an implementation of a memory pool. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_BASIC_MEMPOOL_HPP -#define RAJA_BASIC_MEMPOOL_HPP - -#include -#include -#include -#include -#include -#include - -#include "RAJA/util/align.hpp" -#include "RAJA/util/mutex.hpp" - -namespace RAJA -{ - -namespace basic_mempool -{ - -namespace detail -{ - - -/*! \class MemoryArena - ****************************************************************************** - * - * \brief MemoryArena is a map based subclass for class MemPool - * provides book-keeping to divy a large chunk of pre-allocated memory to avoid - * the overhead of malloc/free or cudaMalloc/cudaFree, etc - * - * get/give are the primary calls used by class MemPool to get aligned memory - * from the pool or give it back - * - * - ****************************************************************************** - */ -class MemoryArena -{ -public: - using free_type = std::map; - using free_value_type = typename free_type::value_type; - using used_type = std::map; - using used_value_type = typename used_type::value_type; - - MemoryArena(void* ptr, size_t size) - : m_allocation{ ptr, static_cast(ptr)+size }, - m_free_space(), - m_used_space() - { - m_free_space[ptr] = static_cast(ptr)+size ; - if (m_allocation.begin == nullptr) { - fprintf(stderr, "Attempt to create MemoryArena with no memory"); - std::abort(); - } - } - - MemoryArena(MemoryArena const&) = delete; - MemoryArena& operator=(MemoryArena const&) = delete; - - MemoryArena(MemoryArena&&) = default; - MemoryArena& operator=(MemoryArena&&) = default; - - size_t capacity() - { - return static_cast(m_allocation.end) - - static_cast(m_allocation.begin); - } - - bool unused() { return m_used_space.empty(); } - - void* get_allocation() { return m_allocation.begin; } - - void* get(size_t nbytes, size_t alignment) - { - void* ptr_out = nullptr; - if (capacity() >= nbytes) { - free_type::iterator end = m_free_space.end(); - for (free_type::iterator iter = m_free_space.begin(); iter != end; - ++iter) { - - void* adj_ptr = iter->first; - size_t cap = - static_cast(iter->second) - static_cast(adj_ptr); - - if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) { - - ptr_out = adj_ptr; - - remove_free_chunk(iter, - adj_ptr, - static_cast(adj_ptr) + nbytes); - - add_used_chunk(adj_ptr, static_cast(adj_ptr) + nbytes); - - break; - } - } - } - return ptr_out; - } - - bool give(void* ptr) - { - if (m_allocation.begin <= ptr && ptr < m_allocation.end) { - - used_type::iterator found = m_used_space.find(ptr); - - if (found != m_used_space.end()) { - - add_free_chunk(found->first, found->second); - - m_used_space.erase(found); - - } else { - fprintf(stderr, "Invalid free %p", ptr); - std::abort(); - } - - return true; - } else { - return false; - } - } - -private: - struct memory_chunk { - void* begin; - void* end; - }; - - void add_free_chunk(void* begin, void* end) - { - // integrates a chunk of memory into free_space - free_type::iterator invl = m_free_space.end(); - free_type::iterator next = m_free_space.lower_bound(begin); - - // check if prev exists - if (next != m_free_space.begin()) { - // check if prev can cover [begin, end) - free_type::iterator prev = next; - --prev; - if (prev->second == begin) { - // extend prev to cover [begin, end) - prev->second = end; - - // check if prev can cover next too - if (next != invl) { - assert(next->first != begin); - - if (next->first == end) { - // extend prev to cover next too - prev->second = next->second; - - // remove redundant next - m_free_space.erase(next); - } - } - return; - } - } - - if (next != invl) { - assert(next->first != begin); - - if (next->first == end) { - // extend next to cover [begin, end) - m_free_space.insert(next, free_value_type{begin, next->second}); - m_free_space.erase(next); - - return; - } - } - - // no free space adjacent to this chunk, add seperate free chunk [begin, - // end) - m_free_space.insert(next, free_value_type{begin, end}); - } - - void remove_free_chunk(free_type::iterator iter, void* begin, void* end) - { - - void* ptr = iter->first; - void* ptr_end = iter->second; - - // fixup m_free_space, shrinking and adding chunks as needed - if (ptr != begin) { - - // shrink end of current free region to [ptr, begin) - iter->second = begin; - - if (end != ptr_end) { - - // insert free region [end, ptr_end) after current free region - free_type::iterator next = iter; - ++next; - m_free_space.insert(next, free_value_type{end, ptr_end}); - } - - } else if (end != ptr_end) { - - // shrink beginning of current free region to [end, ptr_end) - free_type::iterator next = iter; - ++next; - m_free_space.insert(next, free_value_type{end, ptr_end}); - m_free_space.erase(iter); - - } else { - - // can not reuse current region, erase - m_free_space.erase(iter); - } - } - - void add_used_chunk(void* begin, void* end) - { - // simply inserts a chunk of memory into used_space - m_used_space.insert(used_value_type{begin, end}); - } - - memory_chunk m_allocation; - free_type m_free_space; - used_type m_used_space; -}; - -} /* end namespace detail */ - - -/*! \class MemPool - ****************************************************************************** - * - * \brief MemPool pre-allocates a large chunk of memory and provides generic - * malloc/free for the user to allocate aligned data within the pool - * - * MemPool uses MemoryArena to do the heavy lifting of maintaining access to - * the used/free space. - * - * MemPool provides an example generic_allocator which can guide more - *specialized - * allocators. The following are some examples - * - * using device_mempool_type = basic_mempool::MemPool; - * using device_zeroed_mempool_type = - *basic_mempool::MemPool; - * using pinned_mempool_type = basic_mempool::MemPool; - * - * The user provides the specialized allocator, for example : - * struct DeviceAllocator { - * - * // returns a valid pointer on success, nullptr on failure - * void* malloc(size_t nbytes) - * { - * void* ptr; - * cudaErrchk(cudaMalloc(&ptr, nbytes)); - * return ptr; - * } - * - * // returns true on success, false on failure - * bool free(void* ptr) - * { - * cudaErrchk(cudaFree(ptr)); - * return true; - * } - * }; - * - * - ****************************************************************************** - */ -template -class MemPool -{ -public: - using allocator_type = allocator_t; - - static inline MemPool& getInstance() - { - static MemPool pool{}; - return pool; - } - - static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull; - - MemPool() - : m_arenas(), m_default_arena_size(default_default_arena_size), m_alloc() - { - } - - ~MemPool() - { - // With static objects like MemPool, cudaErrorCudartUnloading is a possible - // error with cudaFree - // So no more cuda calls here - } - - - void free_chunks() - { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif - - while (!m_arenas.empty()) { - void* allocation_ptr = m_arenas.front().get_allocation(); - m_alloc.free(allocation_ptr); - m_arenas.pop_front(); - } - } - - size_t arena_size() - { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif - - return m_default_arena_size; - } - - size_t arena_size(size_t new_size) - { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif - - size_t prev_size = m_default_arena_size; - m_default_arena_size = new_size; - return prev_size; - } - - template - T* malloc(size_t nTs, size_t alignment = alignof(T)) - { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif - - const size_t size = nTs * sizeof(T); - void* ptr = nullptr; - arena_container_type::iterator end = m_arenas.end(); - for (arena_container_type::iterator iter = m_arenas.begin(); iter != end; - ++iter) { - ptr = iter->get(size, alignment); - if (ptr != nullptr) { - break; - } - } - - if (ptr == nullptr) { - const size_t alloc_size = - std::max(size + alignment, m_default_arena_size); - void* arena_ptr = m_alloc.malloc(alloc_size); - if (arena_ptr != nullptr) { - m_arenas.emplace_front(arena_ptr, alloc_size); - ptr = m_arenas.front().get(size, alignment); - } - } - - return static_cast(ptr); - } - - void free(const void* cptr) - { -#if defined(RAJA_ENABLE_OPENMP) - lock_guard lock(m_mutex); -#endif - - void* ptr = const_cast(cptr); - arena_container_type::iterator end = m_arenas.end(); - for (arena_container_type::iterator iter = m_arenas.begin(); iter != end; - ++iter) { - if (iter->give(ptr)) { - ptr = nullptr; - break; - } - } - if (ptr != nullptr) { - fprintf(stderr, "Unknown pointer %p", ptr); - } - } - -private: - using arena_container_type = std::list; - -#if defined(RAJA_ENABLE_OPENMP) - omp::mutex m_mutex; -#endif - - arena_container_type m_arenas; - size_t m_default_arena_size; - allocator_t m_alloc; -}; - -//! example allocator for basic_mempool using malloc/free -struct generic_allocator { - - // returns a valid pointer on success, nullptr on failure - void* malloc(size_t nbytes) { return std::malloc(nbytes); } - - // returns true on success, false on failure - bool free(void* ptr) - { - std::free(ptr); - return true; - } -}; - -} /* end namespace basic_mempool */ - -} /* end namespace RAJA */ - - -#endif /* RAJA_BASIC_MEMPOOL_HPP */ From b85f98c4282b51abb5b8fc5336c6ffea30c2346f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 12 Oct 2021 23:20:46 -0700 Subject: [PATCH 2/7] Add missing include in tut_sort --- examples/tut_sort.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tut_sort.cpp b/examples/tut_sort.cpp index 3eeb5cc575..a856b7236a 100644 --- a/examples/tut_sort.cpp +++ b/examples/tut_sort.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include From 49b16e428f13fad72e81ad63db4e6005dbc3be9f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 12 Oct 2021 23:47:25 -0700 Subject: [PATCH 3/7] Register and deregister Allocators from list --- include/RAJA/util/Allocator.hpp | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp index f0e226bfb7..93beed450d 100644 --- a/include/RAJA/util/Allocator.hpp +++ b/include/RAJA/util/Allocator.hpp @@ -27,6 +27,15 @@ namespace RAJA { +struct Allocator; + +namespace detail +{ + +inline std::vector& get_allocators(); + +} /* end namespace detail */ + /*! \class Allocator ****************************************************************************** * @@ -38,7 +47,10 @@ namespace RAJA */ struct Allocator { - Allocator() = default; + Allocator() + { + detail::get_allocators().emplace_back(this); + } // not copyable or movable Allocator(Allocator const&) = delete; @@ -46,7 +58,18 @@ struct Allocator Allocator& operator=(Allocator const&) = delete; Allocator& operator=(Allocator &&) = delete; - virtual ~Allocator() = default; + virtual ~Allocator() + { + auto& allocators = detail::get_allocators(); + for (auto iter = allocators.cbegin(); + iter != allocators.cend(); + ++iter) { + if (this == *iter) { + allocators.erase(iter); + break; + } + } + } virtual void* allocate(size_t nbytes, size_t alignment = alignof(std::max_align_t)) = 0; From 0c393cdeb26a3d44cce7c87c0409f85dc9574b89 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 12 Oct 2021 23:47:45 -0700 Subject: [PATCH 4/7] Return allocators vector by const ref --- include/RAJA/util/Allocator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp index 93beed450d..921b816b8a 100644 --- a/include/RAJA/util/Allocator.hpp +++ b/include/RAJA/util/Allocator.hpp @@ -112,7 +112,7 @@ inline std::vector& get_allocators() /*! * \brief Get the set of allocators used by RAJA internally */ -inline std::vector get_allocators() +inline std::vector const& get_allocators() { return detail::get_allocators(); } From 0287c8efa77f6e2456f20706c4cb986fc0b5cf2a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 12 Oct 2021 23:47:59 -0700 Subject: [PATCH 5/7] Fix compile error in AllocatorPool --- include/RAJA/util/AllocatorPool.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/AllocatorPool.hpp b/include/RAJA/util/AllocatorPool.hpp index 1c8405723a..e12dce00da 100644 --- a/include/RAJA/util/AllocatorPool.hpp +++ b/include/RAJA/util/AllocatorPool.hpp @@ -444,7 +444,7 @@ struct AllocatorPool : Allocator } } - size_t get_arena_size() const + size_t get_arena_size() { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); From e5d6acab61220d534ea0127e600bccabaf5724b8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 29 Oct 2021 13:40:07 -0700 Subject: [PATCH 6/7] Add AllocatorWithStats This is a layer under Allocator that can provide stats. The idea is to allow users to expose stats for their allocators or not. --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 3 +++ include/RAJA/policy/hip/MemUtils_HIP.hpp | 3 +++ include/RAJA/util/Allocator.hpp | 28 +++++++++++++++++++--- include/RAJA/util/AllocatorPool.hpp | 13 ++++++---- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index b7275d044c..fb74e35680 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -107,6 +107,7 @@ inline std::unique_ptr make_default_device_allocator() return std::unique_ptr( new AllocatorPool( std::string("RAJA::cuda::default_device_allocator"), + Platform::cuda, DeviceBaseAllocator())); } /// @@ -115,6 +116,7 @@ inline std::unique_ptr make_default_device_zeroed_allocator() return std::unique_ptr( new AllocatorPool( std::string("RAJA::cuda::default_device_zeroed_allocator"), + Platform::cuda, DeviceZeroedBaseAllocator())); } /// @@ -123,6 +125,7 @@ inline std::unique_ptr make_default_pinned_allocator() return std::unique_ptr( new AllocatorPool( std::string("RAJA::cuda::default_pinned_allocator"), + Platform::cuda, PinnedBaseAllocator())); } diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 5f32b83b94..34a1894926 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -110,6 +110,7 @@ inline std::unique_ptr make_default_device_allocator() return std::unique_ptr( new AllocatorPool( std::string("RAJA::hip::default_device_allocator"), + Platform::hip, DeviceBaseAllocator())); } /// @@ -118,6 +119,7 @@ inline std::unique_ptr make_default_device_zeroed_allocator() return std::unique_ptr( new AllocatorPool( std::string("RAJA::hip::default_device_zeroed_allocator"), + Platform::hip, DeviceZeroedBaseAllocator())); } /// @@ -126,6 +128,7 @@ inline std::unique_ptr make_default_pinned_allocator() return std::unique_ptr( new AllocatorPool( std::string("RAJA::hip::default_pinned_allocator"), + Platform::hip, PinnedBaseAllocator())); } diff --git a/include/RAJA/util/Allocator.hpp b/include/RAJA/util/Allocator.hpp index 921b816b8a..38b525f637 100644 --- a/include/RAJA/util/Allocator.hpp +++ b/include/RAJA/util/Allocator.hpp @@ -85,6 +85,30 @@ struct Allocator virtual void release() = 0; + virtual const std::string& getName() const noexcept = 0; +}; + + +/*! \class AllocatorWithStats + ****************************************************************************** + * + * \brief AllocatorWithStats Provides a generic interface for allocation and + * getting allocation statistics in RAJA + * + ****************************************************************************** + */ +struct AllocatorWithStats : Allocator +{ + AllocatorWithStats() = default; + + // not copyable or movable + AllocatorWithStats(AllocatorWithStats const&) = delete; + AllocatorWithStats(AllocatorWithStats &&) = delete; + AllocatorWithStats& operator=(AllocatorWithStats const&) = delete; + AllocatorWithStats& operator=(AllocatorWithStats &&) = delete; + + virtual ~AllocatorWithStats() = default; + virtual size_t getHighWatermark() const noexcept = 0; virtual size_t getCurrentSize() const noexcept = 0; @@ -93,9 +117,7 @@ struct Allocator virtual size_t getAllocationCount() const noexcept = 0; - virtual const std::string& getName() const noexcept = 0; - - // virtual Platform getPlatform() const noexcept = 0; + virtual Platform getPlatform() const noexcept = 0; }; namespace detail diff --git a/include/RAJA/util/AllocatorPool.hpp b/include/RAJA/util/AllocatorPool.hpp index e12dce00da..a75d5b80ad 100644 --- a/include/RAJA/util/AllocatorPool.hpp +++ b/include/RAJA/util/AllocatorPool.hpp @@ -325,18 +325,20 @@ struct MemoryArena ****************************************************************************** */ template -struct AllocatorPool : Allocator +struct AllocatorPool : AllocatorWithStats { using allocator_type = allocator_t; static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull; AllocatorPool(std::string const& name, + Platform platform, allocator_type const& aloc = allocator_type{}, size_t default_arena_size = default_default_arena_size) : m_default_arena_size(default_arena_size) , m_alloc(aloc) , m_name(name) // std::string("RAJA::AllocatorPool<")+m_alloc.getName()+">") + , m_platform(platform) { } @@ -489,10 +491,10 @@ struct AllocatorPool : Allocator return m_name; } - // Platform getPlatform() const noexcept final - // { - // return m_alloc.getPlatform(); - // } + Platform getPlatform() const noexcept final + { + return m_platform; + } private: using arena_container_type = std::list; @@ -501,6 +503,7 @@ struct AllocatorPool : Allocator size_t m_default_arena_size; allocator_t m_alloc; std::string m_name; + Platform m_platform; size_t m_highWatermark = 0; size_t m_currentSize = 0; From 58d4bcdf6978fb1b8c960ed4b076c7f9bf8f5272 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Nov 2021 16:45:35 -0800 Subject: [PATCH 7/7] Add basic example --- examples/CMakeLists.txt | 4 + examples/tut_allocators.cpp | 340 ++++++++++++++++++++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 examples/tut_allocators.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c7cbd1bd47..2ef0a0ed04 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -139,4 +139,8 @@ raja_add_executable( NAME resource-teams SOURCES resource-teams.cpp) +raja_add_executable( + NAME tut_allocators + SOURCES tut_allocators.cpp) + add_subdirectory(plugin) diff --git a/examples/tut_allocators.cpp b/examples/tut_allocators.cpp new file mode 100644 index 0000000000..87cac9ec32 --- /dev/null +++ b/examples/tut_allocators.cpp @@ -0,0 +1,340 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Reduction Example + * + * This example illustrates use of the RAJA Allocators. + * + * RAJA features shown: + * - Allocator types + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +const int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 256; +#endif + + +// Allocator class derived from RAJA::Allocator +// using memoryManager allocations. +struct ExampleAllocator : RAJA::Allocator +{ + // Allocators may take any constructor args as they are passed through the + // RAJA::*::set_*_allocator calls to the constructor. + ExampleAllocator(const std::string& name) + : m_name(name) + { + std::cout << "\t\t" << getName() << " constructor" << std::endl; + } + + // Virtual destructor. + // Care should be taken as this may be called after main has returned. + virtual ~ExampleAllocator() + { + std::cout << "\t\t" << getName() << " destructor" << std::endl; + } + + // Override the allocate method. + void* allocate(size_t nbytes, + size_t alignment) override + { + std::cout << "\t\t" << getName() << " allocate nbytes " << nbytes + << " alignment " << alignment << std::endl; + void* ptr = memoryManager::allocate(nbytes); + std::cout << "\t\t" << getName() << " ptr " << ptr << std::endl; + return ptr; + } + + // Override the deallocate method. + void deallocate(void* ptr) override + { + std::cout << "\t\t" << getName() << " deallocate ptr " << ptr << std::endl; + memoryManager::deallocate(ptr); + } + + // Override the release method. + // Used to release memory held by things like pool allocators. + // Called on the old allocator when changing allocators via + // RAJA::*::set_*_allocator and RAJA::*::reset_*_allocator calls. + void release() override + { + std::cout << "\t\t" << getName() << " release" << std::endl; + } + + // override the getName method + const std::string& getName() const noexcept override + { + return m_name; + } +private: + std::string m_name; +}; + + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +// Allocator class derived from RAJA::Allocator +// using memoryManager gpu allocations. +struct ExampleAllocatorGPU : RAJA::Allocator +{ + // Allocators may take any constructor args as they are passed through the + // RAJA::*::set_*_allocator calls to the constructor. + ExampleAllocatorGPU(const std::string& name) + : m_name(name) + { + std::cout << "\t\t" << getName() << " constructor" << std::endl; + } + + // Virtual destructor. + // Care should be taken as this may be called after main has returned. + virtual ~ExampleAllocatorGPU() + { + std::cout << "\t\t" << getName() << " destructor" << std::endl; + } + + // Override the allocate method. + void* allocate(size_t nbytes, + size_t alignment) override + { + std::cout << "\t\t" << getName() << " allocate nbytes " << nbytes + << " alignment " << alignment << std::endl; + void* ptr = memoryManager::allocate_gpu(nbytes); + std::cout << "\t\t" << getName() << " ptr " << ptr << std::endl; + return ptr; + } + + // Override the deallocate method. + void deallocate(void* ptr) override + { + std::cout << "\t\t" << getName() << " deallocate ptr " << ptr << std::endl; + memoryManager::deallocate_gpu(ptr); + } + + // Override the release method. + // Used to release memory held by things like pool allocators. + // Called on the old allocator when changing allocators via + // RAJA::*::set_*_allocator and RAJA::*::reset_*_allocator calls. + void release() override + { + std::cout << "\t\t" << getName() << " release" << std::endl; + } + + // override the getName method + const std::string& getName() const noexcept override + { + return m_name; + } +private: + std::string m_name; +}; + +#endif + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA allocators example...\n"; + +// +// Define array length +// + const int N = 1000000; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. +// + int* a = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + if ( i % 2 == 0 ) { + a[i] = 1; + } else { + a[i] = -1; + } + } + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// - the sum will be zero +// + +// +// Define index range for iterating over a elements in all examples +// + RAJA::RangeSegment arange(0, N); + + +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA sequential reduction...\n"; + + using EXEC_POL1 = RAJA::seq_exec; + using REDUCE_POL1 = RAJA::seq_reduce; + + RAJA::ReduceSum seq_sum(0); + + RAJA::forall(arange, [=](int i) { + + seq_sum += a[i]; + + }); + + std::cout << "\tsum = " << seq_sum.get() << std::endl; + } + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + { + std::cout << "\n Running RAJA OpenMP reduction...\n"; + + using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using REDUCE_POL2 = RAJA::omp_reduce; + + RAJA::ReduceSum omp_sum(0); + + RAJA::forall(arange, [=](int i) { + + omp_sum += a[i]; + + }); + + std::cout << "\tsum = " << omp_sum.get() << std::endl; + } +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + { + std::cout << "\n Setting RAJA CUDA device allocator...\n"; + + RAJA::cuda::set_device_allocator("CUDA_ExampleAllocatorGPU"); + + std::cout << "\n Getting RAJA CUDA device allocator...\n"; + + RAJA::Allocator& cuda_device_allocator = RAJA::cuda::get_device_allocator(); + + std::cout << "\n Got RAJA CUDA device allocator " << cuda_device_allocator.getName() << "...\n"; + + { + std::cout << "\n Running RAJA CUDA reduction...\n"; + + using EXEC_POL3 = RAJA::cuda_exec; + using REDUCE_POL3 = RAJA::cuda_reduce; + + std::cout << "\n Constructing RAJA CUDA reduction object...\n"; + RAJA::ReduceSum cuda_sum(0); + + std::cout << "\n Running RAJA CUDA reduction kernel...\n"; + RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + + cuda_sum += a[i]; + + }); + + + std::cout << "\n Getting RAJA CUDA reduction result...\n"; + + int result = cuda_sum.get(); + + std::cout << "\tsum = " << result << std::endl; + } + + std::cout << "\n Resetting RAJA CUDA device allocator...\n"; + + RAJA::cuda::reset_device_allocator(); + + std::cout << "\n Done with RAJA CUDA device allocator...\n"; + } +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + { + std::cout << "\n Setting RAJA HIP device allocator...\n"; + + RAJA::hip::set_device_allocator("HIP_ExampleAllocatorGPU"); + + std::cout << "\n Getting RAJA HIP device allocator...\n"; + + RAJA::Allocator& hip_device_allocator = RAJA::hip::get_device_allocator(); + + std::cout << "\n Got RAJA HIP device allocator " << hip_device_allocator.getName() << "...\n"; + + { + int* d_a = memoryManager::allocate_gpu(N); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + + using EXEC_POL3 = RAJA::hip_exec; + using REDUCE_POL3 = RAJA::hip_reduce; + + std::cout << "\n Constructing RAJA HIP reduction object...\n"; + RAJA::ReduceSum hip_sum(0); + + std::cout << "\n Running RAJA HIP reduction kernel...\n"; + RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + + hip_sum += d_a[i]; + + }); + + std::cout << "\n Getting RAJA HIP reduction result...\n"; + + int result = hip_sum.get(); + + std::cout << "\tsum = " << result << std::endl; + + memoryManager::deallocate_gpu(d_a); + } + + std::cout << "\n Resetting RAJA HIP device allocator...\n"; + + RAJA::hip::reset_device_allocator(); + + std::cout << "\n Done with RAJA HIP device allocator...\n"; + } +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(a); + + std::cout << "\n DONE!...\n"; + + return 0; +}